被动让出调度

抢占调度（goroutine因运行时间过长）。
- 抢占调度：因goroutine运行时间过长而发生的。
- goroutine因读写channel等阻塞而导致的被动调度，以及通过调用Gosched()函数发起的主动调度。

抢占标识

retake()

_Prunning，表示对应的goroutine正在运行，如果其运行时间超过了10毫秒则需要抢占。
_Psyscall，表示对应的goroutine正在内核执行系统调用，此时需要根据多个条件来判断是否需要抢占。
该函数只在sysmon监控线程中被调用。
参数now int64：当前时间。
返回值uint32：本次检查中，从系统调用中被成功夺回（retake）的P的数量。
文件位置：go1.19.3/src/runtime/proc.go。

// forcePreemptNS is the time slice given to a G before it is
// preempted.
//
// The value is expressed in nanoseconds (10 * 1000 * 1000 ns = 10ms). retake
// adds it to the sysmon-recorded schedwhen timestamp and compares the sum
// against the current time.
const forcePreemptNS = 10 * 1000 * 1000 // 10ms

5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445


// retake walks every P and reacts to Gs that have been running, or sitting in
// a system call, for too long.
//
//  1. A P in _Prunning or _Psyscall whose schedtick has not advanced since
//     sysmon last recorded it, and whose recorded schedwhen is at least
//     forcePreemptNS (10ms) old, gets a preemption request via preemptone.
//  2. A P in _Psyscall is additionally taken away (status CAS'd to _Pidle and
//     then passed to handoffp) when either step 1 fired or the P was already
//     seen in the same system call on an earlier pass, unless the
//     "nothing else to do" check further down says to leave it alone.
//
// now is the current time, in the same units as forcePreemptNS. The return
// value is the number of Ps that were successfully retaken from system calls.
func retake(now int64) uint32 {
    n := 0
    
    // 1) Lock allp before walking it.
    
    // Prevent allp slice changes. This lock will be completely
    // uncontended unless we're already stopping the world.
    lock(&allpLock) // hold allpLock while reading allp
    
    // 2) Visit every P and decide, from its recorded timestamps, whether a
    // preemption request is needed.
    
    // We can't use a range loop over allp because we may
    // temporarily drop the allpLock. Hence, we need to re-fetch
    // allp each time around the loop.
    //
    // A range loop evaluates its operand once up front, so it would not
    // notice allp being replaced while the lock is dropped further down.
    for i := 0; i < len(allp); i++ { // index loop: len(allp) and allp[i] are re-read on every iteration
        _p_ := allp[i]
        // 2.1) Skip slots that have not been populated yet.
        if _p_ == nil {
            // This can happen if procresize has grown
            // allp but not yet created new Ps.
            continue
        }
        
        // 2.2) sysmontick is sysmon's private record of what it last observed
        // on this P. This function uses four of its fields:
        //   schedtick   - value of _p_.schedtick at the last observation
        //   schedwhen   - time of that observation
        //   syscalltick - value of _p_.syscalltick at the last observation
        //   syscallwhen - time of that observation
        pd := &_p_.sysmontick           // sysmon-side bookkeeping for this P
        
        // Only two states are acted on below:
        //   _Prunning: the P is running a goroutine
        //   _Psyscall: the P's goroutine is inside a system call
        s := _p_.status // snapshot of the P's status; also the expected value for the CAS below
        
        // sysretake records whether a preemption request was issued for this
        // P during this pass (false = not issued, true = issued).
        sysretake := false 
        // 2.3) First use schedtick/schedwhen to detect a G that has occupied
        // this P for too long. The check also applies to _Psyscall, so time
        // spent inside the system call counts towards the 10ms.
        if s == _Prunning || s == _Psyscall {
            // Preempt G if it's running for too long.
            
            // _p_.schedtick is the P's own counter; pd.schedtick is sysmon's
            // copy of it taken at the previous observation.
            t := int64(_p_.schedtick) // NOTE(review): the notes say the scheduler increments this on every schedule — confirm
            
            // If the two values differ, the P has scheduled since sysmon last
            // looked. If they match, nothing was scheduled between
            // pd.schedwhen and now, i.e. the same G has held the P throughout.
            if int64(pd.schedtick) != t { // a new scheduling round was observed
                // 2.4a) Restart the clock for this round.
                pd.schedtick = uint32(t)
                pd.schedwhen = now
            } else if pd.schedwhen+forcePreemptNS <= now {
                // 2.4b) At least 10ms have passed since sysmon first saw this
                // round, so request preemption.
                preemptone(_p_) // return value is ignored; for a P that is not in a syscall nothing more happens in this pass
                
                // In case of syscall, preemptone() doesn't
                // work, because there is no M wired to P.
                sysretake = true // remember that the 10ms limit was hit
            }
            // 2.4c) Otherwise the current round is still under 10ms; do nothing.
        }
        
        // 2.5) Extra handling when the P is in a system call.
        if s == _Psyscall {
            // Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
            //
            // NOTE(review): the notes say _p_.syscalltick is incremented by
            // the worker thread when a system call completes — confirm. It is
            // also incremented below when the P is retaken.
            t := int64(_p_.syscalltick)	
            // sysretake is false when the 10ms check above did not fire,
            // either because the round is younger than 10ms or because a new
            // round was just recorded. In that case a changed syscalltick
            // means this system call was not seen on the previous pass:
            // record it and give it one more sysmon tick.
            if !sysretake && int64(pd.syscalltick) != t {
                pd.syscalltick = uint32(t) // update syscalltick
                pd.syscallwhen = now // update syscallwhen
                continue
            }
            
            // 2.6) Reaching this point means either
            //   a) sysretake is true: the 10ms limit was hit and the P is in
            //      a system call, or
            //   b) sysretake is false and pd.syscalltick == t: the P was
            //      already seen in this same system call on an earlier pass,
            //      so how soon this triggers depends on the sysmon interval.
            
            // On the one hand we don't want to retake Ps if there is no other work to do,
            // but on the other hand we want to retake them eventually
            // because they can prevent the sysmon thread from deep sleep.
            //
            // The P is retaken if ANY of the following holds:
            //   1. its local run queue is not empty (work is waiting);
            //   2. there are no spinning Ms and no idle Ps (the system is busy);
            //   3. at least 10ms have passed since pd.syscallwhen.
            if runqempty(_p_) && atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) > 0 && pd.syscallwhen+10*1000*1000 > now {
                  // None of the three holds: leave the P with its system call.
                  continue
            }
            
            // Drop allpLock so we can take sched.lock.
            //
            // allp may change while the lock is dropped, which is why the
            // outer loop cannot be a range loop.
            unlock(&allpLock)   // release allpLock
            
            // Need to decrement number of idle locked M's
            // (pretending that one more is running) before the CAS.
            // Otherwise the M from which we retake can exit the syscall,
            // increment nmidle and report deadlock.
            incidlelocked(-1)   // sched.nmidlelocked += -1
            // A CAS is used instead of a plain store: if the status is no
            // longer s (for example because the system call returned in the
            // meantime — presumably the exiting M changes it; confirm), the
            // CAS fails and the P is left alone. On success the P is marked
            // _Pidle and handed to handoffp.
            if atomic.Cas(&_p_.status, s, _Pidle) {
                if trace.enabled {
                    traceGoSysBlock(_p_)
                    traceProcStop(_p_)
                }
                n++
                _p_.syscalltick++ // bump the syscall tick of the retaken P
                // handoffp decides whether to start an M for this P or to
                // park the P on the idle list.
                handoffp(_p_)	
            }
            incidlelocked(1)
            lock(&allpLock)
        }
    }
    unlock(&allpLock)
    return uint32(n)
}

incidlelocked()

文件位置：go1.19.3/src/runtime/proc.go。

5505
5506
5507
5508
5509
5510
5511
5512
5513
5514


// incidlelocked adds v to sched.nmidlelocked while holding sched.lock and,
// when v is positive, calls checkdead before releasing the lock.
func incidlelocked(v int32) {
    lock(&sched.lock)
    // NOTE(review): the notes describe nmidlelocked as the number of locked
    // Ms waiting for work, adjusted only in this function and inspected by
    // checkdead — confirm against the sched struct and checkdead.
    sched.nmidlelocked += v
    if v > 0 {
        checkdead()
    }
    unlock(&sched.lock)
}

preemptone()

sysmon线程如果监控到某个goroutine连续运行超过了10毫秒，则会调用preemptone()函数向该goroutine发出抢占请求。
告诉在处理器P上运行的goroutine停止。
这个函数只是尽了最大努力。它可能会错误地没有通知goroutine。也可能会通知错误的goroutine。
即使它通知了正确的goroutine，如果goroutine同时执行newstack，它可能会忽略请求。
不需要锁。如果发出抢占请求，则返回true。
实际的抢占将在未来的某个时间点发生，并且将由gp->status不再是Grunning表示。设置抢占请求。
该函数会在retake()函数中调用，GC期间调用。
可以看出，preemptone函数只是简单的设置了被抢占goroutine对应的g结构体中的 preempt成员为true和stackguard0成员为stackPreempt（stackPreempt是一个常量0xfffffffffffffade，是非常大的一个数）就返回了，并未真正强制被抢占的goroutine暂停下来。
既然设置了一些抢占标志，那么就一定需要对这些标志进行处理，下面我们就来分析被抢占的goroutine如何处理这些标志去响应监控线程提出的抢占请求。
文件位置：go1.19.3/src/runtime/proc.go。

5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435


// Tell the goroutine running on processor P to stop.
// This function is purely best-effort. It can incorrectly fail to inform the
// goroutine. It can inform the wrong goroutine. Even if it informs the
// correct goroutine, that goroutine might ignore the request if it is
// simultaneously executing newstack.
// No lock needs to be held.
// Returns true if preemption request was issued.
// The actual preemption will happen at some point in the future
// and will be indicated by the gp->status no longer being
// Grunning
func preemptone(_p_ *p) bool {
    // 1) Find the M currently attached to this P.
    mp := _p_.m.ptr()   // nil when no M is attached
    
    // 2) No request is issued when there is no M to notify, or when the M is
    // the caller's own:
    //   - mp == nil: the P has no M attached (retake's comment notes that
    //     this is the case for a P in a system call).
    //   - mp == getg().m: the caller would be preempting itself.
    if mp == nil || mp == getg().m {
        return false
    }
    
    
    // 3) No request is issued when the M has no user goroutine to preempt.
    gp := mp.curg   // goroutine currently running on mp
    //   - gp == nil: the M is not running a goroutine right now.
    //   - gp == mp.g0: the M is on its g0 (presumably running scheduler
    //     code — confirm).
    if gp == nil || gp == mp.g0 {
        return false
    }

    gp.preempt = true // flag the goroutine as having a pending preemption request

    // Every call in a goroutine checks for stack overflow by
    // comparing the current stack pointer to gp->stackguard0.
    // Setting gp->stackguard0 to StackPreempt folds
    // preemption into the normal stack overflow check.
    //
    // NOTE(review): the notes give stackPreempt as 0xfffffffffffffade, i.e.
    // larger than any real stack pointer — confirm for the target platform.
    gp.stackguard0 = stackPreempt // the goroutine's next stack check fails and diverts into morestack/newstack

    // Request an async preemption of this P.
    //
    // This covers goroutines that make no function calls and therefore never
    // reach a stack check.
    //   - preemptMSupported: NOTE(review): described in the notes as a
    //     constant that is false on platforms unable to support this — confirm.
    //   - debug.asyncpreemptoff: NOTE(review): described in the notes as a
    //     GODEBUG switch that disables async preemption and is 0 (enabled)
    //     by default — confirm.
    if preemptMSupported && debug.asyncpreemptoff == 0 {
        // Also flag the P itself.
        _p_.preempt = true
        // The asynchronous request itself is made by preemptM.
        preemptM(mp)	// NOTE(review): the notes describe this as sending a preemption signal to mp — confirm
    }

    return true
}

handoffp()

将P从处于系统调用中的M或被锁定的M上移交出去（hand off）。总是在没有P的情况下运行，因此不允许有写屏障。
handoffp()函数主要任务是通过各种条件判断是否需要启动工作线程来接管_p_，如果不需要则把_p_放入P的全局空闲队列。
1. _p_的本地运行队列或全局运行队列里面有待运行的goroutine。
2. 需要帮助gc完成标记工作。
3. 系统比较忙，所有其它_p_都在运行goroutine，需要帮忙。
4. 所有其它P都已经处于空闲状态，如果需要监控网络连接读写事件，则需要启动新的m来poll网络连接。
文件位置：go1.19.3/src/runtime/proc.go。

2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439


// Hands off P from syscall or locked M.
// Always runs without a P, so write barriers are not allowed.
//
// It either starts an M to run _p_ (via startm) or, when there is nothing
// for _p_ to do, puts _p_ on the idle list (via pidleput). When
// sched.gcwaiting is set it instead moves _p_ to _Pgcstop.
//
//go:nowritebarrierrec
func handoffp(_p_ *p) {
    // handoffp must start an M in any situation where
    // findrunnable would return a G to run on _p_.

    // if it has local work, start it straight away
    //
    // "Local work" here also includes a non-empty global run queue
    // (sched.runqsize != 0).
    if !runqempty(_p_) || sched.runqsize != 0 {
        startm(_p_, false) // start (or wake) an M to take over _p_
        return
    }
    // if there's trace work to do, start it straight away
    if (trace.enabled || trace.shutdown) && traceReaderAvailable() {
        startm(_p_, false)
        return
    }
    // if it has GC work, start it straight away
    //
    // That is: GC marking is enabled and there is mark work available for _p_.
    if gcBlackenEnabled != 0 && gcMarkWorkAvailable(_p_) {
        startm(_p_, false)
        return
    }
    // no local work, check that there are no spinning/idle M's,
    // otherwise our help is not required
    //
    // The condition has two parts:
    //   1. nmspinning + npidle == 0: no spinning Ms and no idle Ps.
    //   2. the CAS raises sched.nmspinning from 0 to 1; if it fails, another
    //      thread got there first and this branch is skipped.
    if atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) == 0 && atomic.Cas(&sched.nmspinning, 0, 1) { // TODO: fast atomic
        startm(_p_, true) // nmspinning was just raised by the CAS; the true argument presumably marks the new M as spinning so it can look for work on other Ps — confirm
        return
    }
    lock(&sched.lock)
    // A stop is pending (sched.gcwaiting is set): park _p_ for it.
    if sched.gcwaiting != 0 {
        _p_.status = _Pgcstop // mark the P as stopped for GC
        sched.stopwait--      // one fewer P left to stop
        // Once the last P has stopped, wake whoever is waiting on
        // sched.stopnote (per the notes, the thread that requested the stop).
        if sched.stopwait == 0 {
            notewakeup(&sched.stopnote)
        }
        unlock(&sched.lock)
        return
    }
    // If a safe-point function is pending for this P, claim it with the CAS
    // (1 -> 0), run it on the P's behalf, and wake the waiter once the last
    // P has been handled.
    if _p_.runSafePointFn != 0 && atomic.Cas(&_p_.runSafePointFn, 1, 0) {
        sched.safePointFn(_p_)
        sched.safePointWait--
        if sched.safePointWait == 0 {
            notewakeup(&sched.safePointNote)
        }
    }
    // Re-check the global run queue, this time under sched.lock.
    if sched.runqsize != 0 {
        unlock(&sched.lock)
        startm(_p_, false)
        return
    }
    // If this is the last running P and nobody is polling network,
    // need to wakeup another M to poll network.
    //
    //   1. sched.npidle == uint32(gomaxprocs-1): every other P is already
    //      idle, so _p_ is the last one not on the idle list.
    //   2. atomic.Load64(&sched.lastpoll) != 0: NOTE(review): per the notes,
    //      a non-zero lastpoll means no thread is currently blocked in
    //      netpoll — confirm.
    if sched.npidle == uint32(gomaxprocs-1) && atomic.Load64(&sched.lastpoll) != 0 {
        unlock(&sched.lock)
        startm(_p_, false)
        return
    }

    // The scheduler lock cannot be held when calling wakeNetPoller below
    // because wakeNetPoller may call wakep which may call startm.
    when := nobarrierWakeTime(_p_) // NOTE(review): per the notes, the earliest pending timer time for _p_ (0 when there is none) — confirm
    pidleput(_p_, 0) // nothing to do: put _p_ on the idle list
    unlock(&sched.lock)

    if when != 0 {
        wakeNetPoller(when)
    }
    
    // Reaching this point means no M was started for _p_; it was parked on
    // the idle list instead.
}

响应抢占请求

抢占的相关函数调用链morestack_noctxt()->morestack()->newstack()。
从源代码中morestack()函数的注释可以知道，该函数会被编译器自动插入到函数 序言(prologue) 中。

morestack_noctxt()

文件位置：go1.19.3/src/runtime/asm_amd64.s。

574
575
576
577


# morestack but not preserving ctxt.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
    MOVL    $0, DX              # DX = 0; morestack later stores DX into g.sched.ctxt, so the saved context is zero
    JMP	runtime·morestack(SB)   # JMP, not CALL: no return address is pushed, so this is a tail jump rather than a call

我们假设是在main.main函数序言中调用了morestack_noctxt()函数，则函数的栈帧结构如下：

//  +10 | 
//      ----------------------------    runtime.main SP
//  +08 | 返回到runtime.main的地址（main.main的返回地址）
//      ----------------------------    main.main SP
//  +00 | 返回到main.main的地址（morestack_noctxt的返回地址）
//      ----------------------------    morestack_noctxt SP
//
//      runtime·morestack(SB)是通过JMP跳转过去的，没有压入新的返回地址，所以没有重新分配栈帧

morestack()

当需要更多栈时，在函数prolog期间调用。
回溯例程将g0上的morestack视为栈的顶部(例如，morestack调用newstack调用调度器调用newm调用gc)，
因此我们必须记录参数大小。为此，它没有参数。
文件位置：go1.19.3/src/runtime/asm_amd64.s。
该函数，保护调用者信息，切换到g0栈调用runtime·newstack方法。

269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359


# Called during function prolog when more stack is needed.
#
# The traceback routines see morestack on a g0 as being
# the top of a stack (for example, morestack calling newstack
# calling the scheduler calling newm calling gc), so we must
# record an argument size. For that purpose, it has no arguments.
#
# In outline: refuse to run on g0 or gsignal, save the context of f's caller
# in m->morebuf and f's own context in g->sched, switch to the g0 stack, and
# call runtime·newstack. Here f is the function whose prologue got here.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
    # Cannot grow scheduler stack (m->g0).
    get_tls(CX)             # load the TLS base into CX so that g(CX) addresses the current g
    MOVQ    g(CX), BX       # BX = current g
    MOVQ    g_m(BX), BX     # BX = g.m
    MOVQ    m_g0(BX), SI    # SI = m.g0
    CMPQ    g(CX), SI       # is the current g the M's g0?
    JNE	3(PC) # not equal: skip the next two instructions
    # Running on g0: per the notes, badmorestackg0 reports that morestack was called on the g0 stack
    CALL    runtime·badmorestackg0(SB)
    # runtime·abort executes INT $3 (a breakpoint trap) and then spins
    CALL    runtime·abort(SB)

    # Cannot grow signal stack (m->gsignal).
    MOVQ    m_gsignal(BX), SI   # SI = m.gsignal
    CMPQ    g(CX), SI
    JNE	3(PC) # not equal: skip the next two instructions
    # Running on gsignal: per the notes, badmorestackgsignal reports that morestack was called on the signal stack
    CALL    runtime·badmorestackgsignal(SB)
    CALL    runtime·abort(SB)

    # Called from f.
    # Set m->morebuf to f's caller.
    #
    # The "NOP SP" line below is a marker for `go vet`, as its inline comment
    # says: after it, vet stops validating SP-relative offsets in this function.
    # NOTE(review): the earlier notes here described NOP SP as padding for
    # instruction alignment. Nothing in this listing supports that reading;
    # confirm against the vet asmdecl analyzer before relying on either.
    NOP	SP	# tell vet SP changed - stop checking offsets
    
    # The block below saves the context of f's caller.
    
    # 8(SP) holds the return address into f's caller. It is stored in
    # m->morebuf (on the M, not on the g) for newstack to use.
    MOVQ    8(SP), AX	# f's caller's PC; 
    MOVQ    AX, (m_morebuf+gobuf_pc)(BX)    # m.morebuf.pc = AX
    # LEAQ takes the address 16(SP) itself, not the value stored there: that
    # address is the stack pointer of f's caller.
    LEAQ    16(SP), AX	# f's caller's SP   # AX = address SP+16
    MOVQ    AX, (m_morebuf+gobuf_sp)(BX)    # m.morebuf.sp = AX
    get_tls(CX)         # reload the TLS base into CX
    MOVQ    g(CX), SI   # SI = current g (the user goroutine, not g0)
    MOVQ    SI, (m_morebuf+gobuf_g)(BX) # m.morebuf.g = g
    # m->morebuf now holds the pc, sp and g of f's caller.

    # Set g->sched to context in f.
    #
    # This is the context that is resumed later, and it is saved on the g
    # itself (g->sched), not on g0.
    
    # 0(SP) holds the return address into f, i.e. where f resumes.
    MOVQ    0(SP), AX   # f's PC
    # g.sched.pc = AX
    MOVQ    AX, (g_sched+gobuf_pc)(SI)  # address at which f continues when this g is resumed
    # LEAQ again takes an address: 8(SP) is f's stack pointer as it was before
    # the return address at 0(SP) was pushed.
    LEAQ    8(SP), AX                   # f's SP;
    MOVQ    AX, (g_sched+gobuf_sp)(SI)  # g.sched.sp = AX
    # BP has not been modified since f's prologue, so it is still f's frame pointer.
    MOVQ    BP, (g_sched+gobuf_bp)(SI)  # g.sched.bp = BP
    # DX is zero when entry was through runtime·morestack_noctxt.
    MOVQ    DX, (g_sched+gobuf_ctxt)(SI)# g.sched.ctxt = DX;
    # g->sched now holds everything needed to resume f: pc, sp, bp and ctxt.

    # Call newstack on m->g0's stack.
    # Switch to g0: first make g0 the current g in TLS.
    MOVQ    m_g0(BX), BX                # BX = g0
    # Publish g0 as the current g.
    MOVQ    BX, g(CX)                   # TLS g = g0
    # Loading g0's saved stack pointer into SP is the actual stack switch:
    # before this instruction the CPU is on the calling g's stack, after it
    # the CPU is on the g0 stack.
    MOVQ    (g_sched+gobuf_sp)(BX), SP  # SP = g0.sched.sp
    CALL    runtime·newstack(SB)        # call newstack()
    CALL    runtime·abort(SB)           # crash if newstack returns
    RET

汇编语言"int 3"是一个中断指令，它向操作系统发出一个调试信号，要求在程序的当前位置停止执行并进入调试器。
通常，调试器会在此处暂停程序的执行，并允许程序员检查程序状态、变量值和程序流程等信息，以帮助他们调试程序。

# abort raises a breakpoint trap (INT $3). If execution continues past the
# trap, it spins forever in the loop below, so abort does not return.
TEXT runtime·abort(SB),NOSPLIT,$0-0
    INT	$3
loop:
    JMP	loop

newstack()

该函数主要有两个职责：一个是【扩栈】，另一个是响应sysmon提出的【抢占请求】。
newstack()函数首先检查g.stackguard0是否被设置为stackPreempt，如果是则表示sysmon已经发现我们运行得太久了并对我们发起了抢占请求。
当需要更多堆栈时从runtime·morestack调用。分配更大的堆栈并重新定位到新堆栈。对于固定的平摊代价，堆栈增长是乘法的。
进入该函数时，g->atomicstatus为Grunning或Gscanrunning。如果调度器正试图停止这个g，它会设置preemptStop。
这必须是nowritebarrierrec，因为它可以作为堆栈增长的一部分从其他nowritebarrierrec函数调用，但编译器不会检查这一点。
go:nowritebarrierrec：编译器不插入写屏障相关代码，包括当前函数以及调用的任何函数中。
文件位置：go1.19.3/src/runtime/stack.go。

 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142


// Called from runtime·morestack when more stack is needed.
// Allocate larger stack and relocate to new stack.
// Stack growth is multiplicative, for constant amortized cost.
//
// g->atomicstatus will be Grunning or Gscanrunning upon entry.
// If the scheduler is trying to stop this g, then it will set preemptStop.
//
// This must be nowritebarrierrec because it can be called as part of
// stack growth from other nowritebarrierrec functions, but the
// compiler doesn't check this.
//
// Besides growing the stack, this function is also where a pending
// preemption request (stackguard0 == stackPreempt) is acted upon.
//
//go:nowritebarrierrec
func newstack() {
    thisg := getg() // thisg is g0: morestack switched to the g0 stack before calling newstack
    // TODO: double check all gp. shouldn't be getg().
    //
    // thisg.m.morebuf.g is the goroutine saved by morestack, i.e. the one
    // that triggered the stack check; it is not g0.
    if thisg.m.morebuf.g.ptr().stackguard0 == stackFork {
        throw("stack growth after fork")
    }
    
    // The goroutine recorded by morestack must be the one this M believes it
    // is running (m.curg).
    if thisg.m.morebuf.g.ptr() != thisg.m.curg {
        print("runtime: newstack called from g=", hex(thisg.m.morebuf.g), "\n"+"\tm=", thisg.m, " m->curg=", thisg.m.curg, " m->g0=", thisg.m.g0, " m->gsignal=", thisg.m.gsignal, "\n")
        morebuf := thisg.m.morebuf
        traceback(morebuf.pc, morebuf.sp, morebuf.lr, morebuf.g.ptr())
        throw("runtime: wrong goroutine in newstack")
    }

    gp := thisg.m.curg  // the user goroutine being grown or preempted

    // A stack split while throwsplit is set is fatal: dump diagnostics and throw.
    // NOTE(review): per the notes, throwsplit is set before a system call
    // (among other places) — confirm.
    if thisg.m.curg.throwsplit {
        // Update syscallsp, syscallpc in case traceback uses them.
        morebuf := thisg.m.morebuf
        gp.syscallsp = morebuf.sp
        gp.syscallpc = morebuf.pc
        pcname, pcoff := "(unknown)", uintptr(0)
        f := findfunc(gp.sched.pc)
        if f.valid() {
            pcname = funcname(f)
            pcoff = gp.sched.pc - f.entry()
        }
        print("runtime: newstack at ", pcname, "+", hex(pcoff),
            " sp=", hex(gp.sched.sp), " stack=[", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n",
            "\tmorebuf={pc:", hex(morebuf.pc), " sp:", hex(morebuf.sp), " lr:", hex(morebuf.lr), "}\n",
            "\tsched={pc:", hex(gp.sched.pc), " sp:", hex(gp.sched.sp), " lr:", hex(gp.sched.lr), " ctxt:", gp.sched.ctxt, "}\n")

        thisg.m.traceback = 2 // Include runtime frames
        traceback(morebuf.pc, morebuf.sp, morebuf.lr, gp)
        throw("runtime: stack split at bad time")
    }

    // morestack filled m.morebuf with the caller's pc, sp and g. Take a copy
    // for the diagnostics below and clear the original.
    morebuf := thisg.m.morebuf
    thisg.m.morebuf.pc = 0
    thisg.m.morebuf.lr = 0
    thisg.m.morebuf.sp = 0
    thisg.m.morebuf.g = 0

    // NOTE: stackguard0 may change underfoot, if another thread
    // is about to try to preempt gp. Read it just once and use that same
    // value now and below.
    stackguard0 := atomic.Loaduintptr(&gp.stackguard0)  // single atomic read of gp.stackguard0

    // Be conservative about where we preempt.
    // We are interested in preempting user Go code, not runtime code.
    // If we're holding locks, mallocing, or preemption is disabled, don't
    // preempt.
    // This check is very early in newstack so that even the status change
    // from Grunning to Gwaiting and back doesn't happen in this case.
    // That status change by itself can be viewed as a small preemption,
    // because the GC might change Gwaiting to Gscanwaiting, and then
    // this goroutine has to wait for the GC to finish before continuing.
    // If the GC is in some way dependent on this goroutine (for example,
    // it needs a lock held by the goroutine), that small preemption turns
    // into a real deadlock.
    preempt := stackguard0 == stackPreempt  // true when a preemption request is pending on gp
    if preempt {
        // canPreemptM -> mp.locks == 0 && mp.mallocing == 0 && mp.preemptoff == "" && mp.p.ptr().status == _Prunning
        if !canPreemptM(thisg.m) {  // true: safe to preempt; false: preemption not allowed right now
            // Preemption is NOT allowed here, so resume gp.
            
            // Let the goroutine keep running for now.
            // gp->preempt is set, so it will be preempted next time.
            //
            // Put stackguard0 back to its normal value so that the next
            // stack check does not immediately divert here again.
            gp.stackguard0 = gp.stack.lo + _StackGuard 
            // Resume gp from its saved context; control does not come back here.
            gogo(&gp.sched) // never return
        }
    }

    if gp.stack.lo == 0 {
        throw("missing stack in newstack")
    }
    sp := gp.sched.sp
    if goarch.ArchFamily == goarch.AMD64 || goarch.ArchFamily == goarch.I386 || goarch.ArchFamily == goarch.WASM {
        // The call to morestack cost a word.
        sp -= goarch.PtrSize
    }
    if stackDebug >= 1 || sp < gp.stack.lo {
        print("runtime: newstack sp=", hex(sp), " stack=[", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n",
            "\tmorebuf={pc:", hex(morebuf.pc), " sp:", hex(morebuf.sp), " lr:", hex(morebuf.lr), "}\n",
            "\tsched={pc:", hex(gp.sched.pc), " sp:", hex(gp.sched.sp), " lr:", hex(gp.sched.lr), " ctxt:", gp.sched.ctxt, "}\n")
    }
    if sp < gp.stack.lo {
        print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->status=", hex(readgstatus(gp)), "\n ")
        print("runtime: split stack overflow: ", hex(sp), " < ", hex(gp.stack.lo), "\n")
        throw("runtime: split stack overflow")
    }

    // Act on the preemption request.
    if preempt {
        if gp == thisg.m.g0 {
            throw("runtime: preempt g0")
        }
        if thisg.m.p == 0 && thisg.m.locks == 0 {
            throw("runtime: g is running but p is not")
        }

        if gp.preemptShrink {
            // We're at a synchronous safe point now, so
            // do the pending stack shrink.
            gp.preemptShrink = false
            shrinkstack(gp)
        }

        // preemptStop is set when the scheduler is trying to stop this g
        // (see the function comment); park it rather than rescheduling it.
        if gp.preemptStop {	
            preemptPark(gp) // never returns
        }

        // Act like goroutine called runtime.Gosched.
        //
        // gopreempt_m switches gp out; at that point the preemption has
        // taken effect.
        gopreempt_m(gp) // never return
    }

    // Everything below is the stack-growth path.
    
    // Allocate a bigger segment and move the stack.
    oldsize := gp.stack.hi - gp.stack.lo
    newsize := oldsize * 2 // double the current stack size

    // Make sure we grow at least as much as needed to fit the new frame.
    // (This is just an optimization - the caller of morestack will
    // recheck the bounds on return.)
    if f := findfunc(gp.sched.pc); f.valid() {
        max := uintptr(funcMaxSPDelta(f))
        needed := max + _StackGuard
        used := gp.stack.hi - gp.sched.sp
        for newsize-used < needed {
            newsize *= 2
        }
    }

    if stackguard0 == stackForceMove {
        // Forced stack movement used for debugging.
        // Don't double the stack (or we may quickly run out
        // if this is done repeatedly).
        newsize = oldsize
    }

    if newsize > maxstacksize || newsize > maxstackceiling {
        if maxstacksize < maxstackceiling {
            print("runtime: goroutine stack exceeds ", maxstacksize, "-byte limit\n")
        } else {
            print("runtime: goroutine stack exceeds ", maxstackceiling, "-byte limit\n")
        }
        print("runtime: sp=", hex(sp), " stack=[", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n")
        throw("stack overflow")
    }

    // The goroutine must be executing in order to call newstack,
    // so it must be Grunning (or Gscanrunning).
    casgstatus(gp, _Grunning, _Gcopystack)

    // The concurrent GC will not scan the stack while we are doing the copy since
    // the gp is in a Gcopystack status.
    copystack(gp, newsize)
    if stackDebug >= 1 {
        print("stack grow done\n")
    }
    casgstatus(gp, _Gcopystack, _Grunning)
    gogo(&gp.sched) // resume the goroutine from its saved context
}

canPreemptM

canPreemptM报告mp是否处于可以安全抢占的状态。
它是nosplit因为它有nosplit的调用者。
go:nosplit：告诉编译器不要在当前函数中插入任何栈扩展代码，这样可以确保当前函数不会导致栈的大小发生变化。
1. 在Go语言中，goroutine的栈是可以增长的：编译器会在普通函数的序言中插入栈检查代码，栈空间不足时通过morestack()->newstack()扩栈。
2. 标记了"go:nosplit"的函数没有这段检查，因此执行该函数本身不会触发扩栈，也不会在函数入口处响应基于stackguard0的抢占请求。这个指令通常用于运行时中不能发生栈切换的关键函数，比如垃圾回收器和调度器中的部分函数。
3. 需要注意的是，去掉栈检查并不能避免栈溢出：nosplit函数只能使用当前栈上剩余的空间，因此这类函数必须足够小，程序员在使用"go:nosplit"指令时需要仔细控制其栈占用。
文件位置：go1.19.3/src/runtime/preempt.go。

282
283
284
285
286
287
288
289
290
291
292
293
294
295


// canPreemptM reports whether mp is in a state that is safe to preempt.
//
// It is nosplit because it has nosplit callers.
//
//go:nosplit
func canPreemptM(mp *m) bool {
    // Returns true (preemption allowed) only when all four conditions hold.
    // newstack's comment summarises the first three as "holding locks,
    // mallocing, or preemption is disabled":
    //  1. mp.locks == 0: the M is not holding locks.
    //  2. mp.mallocing == 0: the M is not in the middle of an allocation.
    //  3. mp.preemptoff == "": preemption has not been disabled on this M
    //     (a non-empty string means it has).
    //  4. mp.p.ptr().status == _Prunning: the M's P is in the running state.
    // NOTE(review): the notes say this function is also consulted on the
    // signal-based (async) preemption path — confirm the caller's name in
    // preempt.go.
    return mp.locks == 0 && mp.mallocing == 0 && mp.preemptoff == "" && mp.p.ptr().status == _Prunning
}

gopreempt_m

抢占调度：后续逻辑与runtime.Gosched一样，直接调用goschedImpl(gp)把当前goroutine切换出去。
文件位置：go1.19.3/src/runtime/proc.go。

3402
3403
3404
3405
3406
3407


// gopreempt_m switches out a goroutine that is being preempted. newstack
// calls it once it has decided to honour a preemption request. It records a
// preempt trace event when tracing is enabled and then hands gp to
// goschedImpl.
func gopreempt_m(gp *g) {
    if trace.enabled {
        traceGoPreempt()
    }
    goschedImpl(gp)
}

系统调用前后

handoffp()，对正在进行系统调用的goroutine的抢占实质上是剥夺与其对应的工作线程所绑定的p。
虽然说处于系统调用之中的工作线程并不需要p，但一旦从操作系统内核返回到用户空间之后就必须绑定一个p才能运行go代码。

系统调用

Syscall6()

系统调用时最终会调用该汇编函数。
文件位置：go1.19.3/src/syscall/asm_unix_amd64.s。
函数原型：func Syscall6(num, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, errno uintptr)。

10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36


# Syscall6 issues a system call with up to six arguments, bracketed by
# runtime·entersyscall and runtime·exitsyscall, and returns (r1, r2, errno).
TEXT    ·Syscall6(SB),NOSPLIT,$0-80
    # Tell the runtime that this goroutine is about to enter a system call.
    CALL    runtime·entersyscall<ABIInternal>(SB)
    # Load the arguments into the registers the kernel expects and trap
    # into the kernel with SYSCALL.
    # The system call number goes in AX.
    MOVQ    trap+0(FP), AX  # syscall entry
    MOVQ    a1+8(FP), DI
    MOVQ    a2+16(FP), SI
    MOVQ    a3+24(FP), DX
    MOVQ    a4+32(FP), R10
    MOVQ    a5+40(FP), R8
    MOVQ    a6+48(FP), R9
    SYSCALL # enter the kernel
    # Back from the kernel. JCC branches when the carry flag is clear; the
    # ok6 path stores the results, the fall-through path stores an errno.
    JCC	ok6
    MOVQ    $-1, r1+56(FP)  # r1
    MOVQ    $0, r2+64(FP)   # r2
    MOVQ    AX, err+72(FP)  # errno
    CALL    runtime·exitsyscall<ABIInternal>(SB)
    RET
ok6:
    # Success: write the values returned by the kernel into the result slots.
    MOVQ    AX, r1+56(FP)   # r1
    MOVQ    DX, r2+64(FP)   # r2
    MOVQ    $0, err+72(FP)  # errno
    CALL    runtime·exitsyscall<ABIInternal>(SB)
    RET

系统调用前

entersyscall()

go系统调用库和普通cgo调用使用的标准系统调用项。
它通过linkname导出给syscall包和x/sys中的汇编代码使用。
文件位置：go1.19.3/src/runtime/proc.go。

3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682


// Standard syscall entry used by the go syscall library and normal cgo calls.
//
// This is exported via linkname to assembly in the syscall package and x/sys.
//
//go:nosplit
//go:linkname entersyscall
func entersyscall() {
    // getcallerpc() and getcallersp() yield the PC and SP of
    // entersyscall's caller; both are handed on to reentersyscall.
    reentersyscall(getcallerpc(), getcallersp())
}

reentersyscall()

文件位置：go1.19.3/src/runtime/proc.go。

3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688


// The goroutine g is about to enter a system call.
// Record that it's not using the cpu anymore.
// This is called only from the go syscall library and cgocall,
// not from the low-level system calls used by the runtime.
//
// Entersyscall cannot split the stack: the save must
// make g->sched refer to the caller's stack segment, because
// entersyscall is going to return immediately after.
//
// Nothing entersyscall calls can split the stack either.
// We cannot safely move the stack during an active call to syscall,
// because we do not know which of the uintptr arguments are
// really pointers (back into the stack).
// In practice, this means that we make the fast path run through
// entersyscall doing no-split things, and the slow path has to use systemstack
// to run bigger things on the system stack.
//
// reentersyscall is the entry point used by cgo callbacks, where explicitly
// saved SP and PC are restored. This is needed when exitsyscall will be called
// from a function further up in the call stack than the parent, as g->syscallsp
// must always point to a valid stack frame. entersyscall below is the normal
// entry point for syscalls, which obtains the SP and PC from the caller.
//
// Syscall tracing:
// At the start of a syscall we emit traceGoSysCall to capture the stack trace.
// If the syscall does not block, that is it, we do not emit any other events.
// If the syscall blocks (that is, P is retaken), retaker emits traceGoSysBlock;
// when syscall returns we emit traceGoSysExit and when the goroutine starts running
// (potentially instantly, if exitsyscallfast returns true) we emit traceGoStart.
// To ensure that traceGoSysExit is emitted strictly after traceGoSysBlock,
// we remember current value of syscalltick in m (_g_.m.syscalltick = _g_.m.p.ptr().syscalltick),
// whoever emits traceGoSysBlock increments p.syscalltick afterwards;
// and we wait for the increment before emitting traceGoSysExit.
// Note that the increment is done even if tracing is not enabled,
// because tracing can be enabled in the middle of syscall. We don't want the wait to hang.
//
//go:nosplit
func reentersyscall(pc, sp uintptr) {
    // user goroutine
    _g_ := getg() // the goroutine performing the system call

    // Disable preemption because during this function g is in Gsyscall status,
    // but can have inconsistent g->sched, do not let GC observe it.
    _g_.m.locks++

    // Entersyscall must not call any function that might split/grow the stack.
    // (See details in comment above.)
    // Catch calls that might, by replacing the stack guard with something that
    // will trip any stack check and leaving a flag to tell newstack to die.
    _g_.stackguard0 = stackPreempt // poisoned guard; exitsyscall puts a real value back
    // Forbid stack splits; exitsyscall clears this flag again.
    _g_.throwsplit = true

    // Leave SP around for GC and traceback.
    save(pc, sp) // record pc and sp in g.sched
    _g_.syscallsp = sp
    _g_.syscallpc = pc
    // Per the accompanying notes, the sysmon thread relies on the
    // _Gsyscall status to preempt goroutines stuck in a system call.
    casgstatus(_g_, _Grunning, _Gsyscall) // mark g as being in a system call
    // Sanity check: the saved SP must lie within g's stack bounds.
    if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
        systemstack(func() {
            print("entersyscall inconsistent ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
            throw("entersyscall")
        })
    }

    if trace.enabled {
        systemstack(traceGoSysCall)
        // systemstack itself clobbers g.sched.{pc,sp} and we might
        // need them later when the G is genuinely blocked in a
        // syscall
        save(pc, sp)
    }

    // sched.sysmonwait is non-zero: per the accompanying notes, the
    // sysmon thread is parked and waiting to be woken.
    if atomic.Load(&sched.sysmonwait) != 0 {
        // Run entersyscall_sysmon on the system stack (the notes say it
        // wakes sysmon), then re-save pc/sp as after the trace call above.
        systemstack(entersyscall_sysmon)
        save(pc, sp)
    }

    if _g_.m.p.ptr().runSafePointFn != 0 {
        // runSafePointFn may stack split if run on this stack
        systemstack(runSafePointFn)
        save(pc, sp)
    }

    // Snapshot the P's syscalltick into the M.
    _g_.m.syscalltick = _g_.m.p.ptr().syscalltick
    _g_.sysblocktraced = true
    // Unlink M and P from each other and stash the P in m.oldp so that
    // exitsyscall can try to take it back after the system call.
    // First clear p.m.
    pp := _g_.m.p.ptr() // pp = the P currently attached to this M
    pp.m = 0
    // m.oldp = pp
    _g_.m.oldp.set(pp)
    // Then clear m.p.
    _g_.m.p = 0
    atomic.Store(&pp.status, _Psyscall) // pp.status = _Psyscall
    // sched.gcwaiting is set: per the accompanying notes, a
    // stop-the-world is waiting for Ps to stop.
    if sched.gcwaiting != 0 {
        // Run entersyscall_gcwait on the system stack. The notes describe
        // it as moving the P to _Pgcstop and, once the stop-the-world is
        // complete, waking the thread waiting on sched.stopnote.
        systemstack(entersyscall_gcwait)
        save(pc, sp)
    }

    _g_.m.locks--
}

save()

保存goroutine现场。
文件位置：go1.19.3/src/runtime/proc.go。

3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573


// save updates getg().sched to refer to pc and sp so that a following
// gogo will restore pc and sp.
//
// save must not have write barriers because invoking a write barrier
// can clobber getg().sched.
//
//go:nosplit
//go:nowritebarrierrec
func save(pc, sp uintptr) {
    gp := getg()

    if gp == gp.m.g0 || gp == gp.m.gsignal {
        // m.g0.sched is special and must describe the context
        // for exiting the thread. mstart1 writes to it directly.
        // m.gsignal.sched should not be used at all.
        // This check makes sure save calls do not accidentally
        // run in contexts where they'd write to system g's.
        throw("save on system g not allowed")
    }

    // Record the resume point; lr and ret are reset to zero.
    gp.sched.pc = pc
    gp.sched.sp = sp
    gp.sched.lr = 0
    gp.sched.ret = 0
    // We need to ensure ctxt is zero, but can't have a write
    // barrier here. However, it should always already be zero.
    // Assert that.
    if gp.sched.ctxt != nil {
        badctxt()
    }
}

系统调用后

exitsyscall()

这个goroutine g退出系统调用。安排它再次在cpu上运行。
这仅从go系统调用库中调用，而不是从运行时使用的低级系统调用中调用。
写屏障是不被允许的，因为我们的P可能被偷了。
文件位置：go1.19.3/src/runtime/proc.go。

3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866


// The goroutine g exited its system call.
// Arrange for it to run on a cpu again.
// This is called only from the go syscall library, not
// from the low-level system calls used by the runtime.
//
// Write barriers are not allowed because our P may have been stolen.
//
// This is exported via linkname to assembly in the syscall package.
//
//go:nosplit
//go:nowritebarrierrec
//go:linkname exitsyscall
func exitsyscall() {
    // user goroutine
    _g_ := getg() // the goroutine returning from the system call

    _g_.m.locks++ // see comment in entersyscall
    if getcallersp() > _g_.syscallsp {
        throw("exitsyscall: syscall frame is no longer valid")
    }

    // Reset g.waitsince (described in the notes as the approximate
    // time at which g became blocked).
    _g_.waitsince = 0
    // The P this M was attached to before entering the system call.
    oldp := _g_.m.oldp.ptr()
    _g_.m.oldp = 0
    // exitsyscallfast tries to attach a P to this M; it returns true on
    // success and false on failure.
    if exitsyscallfast(oldp) {
        // When exitsyscallfast returns success, we have a P so can now use
        // write barriers
        if goroutineProfile.active {
            // Make sure that gp has had its stack written out to the goroutine
            // profile, exactly as it was when the goroutine profiler first
            // stopped the world.
            systemstack(func() {
                tryRecordGoroutineProfileWB(_g_)
            })
        }
        if trace.enabled {
            if oldp != _g_.m.p.ptr() || _g_.m.syscalltick != _g_.m.p.ptr().syscalltick {
                systemstack(traceGoStart)
            }
        }
        // There's a cpu for us, so we can run.
        _g_.m.p.ptr().syscalltick++
        // We need to cas the status and scan before resuming...
        casgstatus(_g_, _Gsyscall, _Grunning)

        // Garbage collector isn't running (since we are),
        // so okay to clear syscallsp.
        _g_.syscallsp = 0
        _g_.m.locks--
        if _g_.preempt {
            // restore the preemption request in case we've cleared it in newstack
            _g_.stackguard0 = stackPreempt
        } else {
            // otherwise restore the real _StackGuard, we've spoiled it in entersyscall/entersyscallblock
            _g_.stackguard0 = _g_.stack.lo + _StackGuard
        }
        _g_.throwsplit = false

        // sched.disable.user is set and schedEnabled reports false for
        // this g, so it must not keep running right now.
        if sched.disable.user && !schedEnabled(_g_) {
            // Scheduling of this goroutine is disabled.
            Gosched() // yield the CPU from the current goroutine
        }

        return
    }

    // Slow path: no P could be attached to this M.

    _g_.sysexitticks = 0
    if trace.enabled {
        // Wait till traceGoSysBlock event is emitted.
        // This ensures consistency of the trace (the goroutine is started after it is blocked).
        for oldp != nil && oldp.syscalltick == _g_.m.syscalltick {
            osyield()
        }
        // We can't trace syscall exit right now because we don't have a P.
        // Tracing code can invoke write barriers that cannot run without a P.
        // So instead we remember the syscall exit time and emit the event
        // in execute when we have a P.
        _g_.sysexitticks = cputicks()
    }

    _g_.m.locks--

    // Call the scheduler.
    //
    // We have no P: mcall switches to the g0 stack (saving this
    // goroutine's context) and runs exitsyscall0 there.
    mcall(exitsyscall0)

    // Scheduler returned, so we're allowed to run now.
    // Delete the syscallsp information that we left for
    // the garbage collector during the system call.
    // Must wait until now because until gosched returns
    // we don't know for sure that the garbage collector
    // is not running.
    _g_.syscallsp = 0
    _g_.m.p.ptr().syscalltick++
    _g_.throwsplit = false
}

exitsyscallfast()

尝试为当前M绑定一个P：优先重新获取进入系统调用前的oldp，否则尝试获取一个空闲的P。true.绑定成功，false.绑定失败。
文件位置：go1.19.3/src/runtime/proc.go。

3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904


// exitsyscallfast tries to attach a P to the current M without going
// through the scheduler. It first attempts to re-acquire oldp (if it is
// still in _Psyscall) and otherwise tries to take a P from the idle
// list. It reports whether a P was attached.
//
//go:nosplit
func exitsyscallfast(oldp *p) bool {
    _g_ := getg() // current g

    // Freezetheworld sets stopwait but does not retake P's.
    //
    // const freezeStopWait int = 0x7fffffff
    if sched.stopwait == freezeStopWait {
        return false
    }

    // Try to re-acquire the last P.
    if oldp != nil && oldp.status == _Psyscall && atomic.Cas(&oldp.status, _Psyscall, _Pidle) {
        // There's a cpu for us, so we can run.
        wirep(oldp) // attach oldp to this M
        exitsyscallfast_reacquired() // per the notes: fixes up the P's syscalltick
        return true
    }

    // Try to get any other idle P.
    if sched.pidle != 0 {
        var ok bool
        // Switch to the g0 stack.
        systemstack(func() {
            // exitsyscallfast_pidle takes sched.lock to pull a P from
            // the idle list, so this path is slower than the one above.
            ok = exitsyscallfast_pidle() // true if a P was attached, false otherwise
            if ok && trace.enabled {
                if oldp != nil {
                    // Wait till traceGoSysBlock event is emitted.
                    // This ensures consistency of the trace (the goroutine is started after it is blocked).
                    for oldp.syscalltick == _g_.m.syscalltick {
                        osyield()
                    }
                }
                traceGoSysExit(0)
            }
        })
        if ok {
            return true
        }
    }
    return false
}

exitsyscallfast_pidle()

文件位置：go1.19.3/src/runtime/proc.go。

3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927


// exitsyscallfast_pidle tries to take a P from the idle list under
// sched.lock and attach it to the current M. It reports whether a P was
// acquired.
func exitsyscallfast_pidle() bool {
    lock(&sched.lock)
    idle, _ := pidleget(0)
    if idle != nil {
        // sysmon may have parked itself while we were in the system
        // call (per the accompanying notes); wake it now that a P is
        // going back to work.
        if atomic.Load(&sched.sysmonwait) != 0 {
            atomic.Store(&sched.sysmonwait, 0)
            notewakeup(&sched.sysmonnote)
        }
    }
    unlock(&sched.lock)

    if idle == nil {
        return false
    }
    acquirep(idle) // attach the idle P to this M
    return true
}

exitsyscall0()

exitsyscall在g0上的慢路径。获取P失败将gp放入可运行队列中。
通过mcall()调用，因此gp就是在这个M上发起调用的g。
文件位置：go1.19.3/src/runtime/proc.go。

3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987


// exitsyscall slow path on g0.
// Failed to acquire P, enqueue gp as runnable.
//
// Called via mcall, so gp is the calling g from this M.
//
//go:nowritebarrierrec
func exitsyscall0(gp *g) {
    // Move gp from _Gsyscall to _Grunnable.
    casgstatus(gp, _Gsyscall, _Grunnable)
    dropg() // detach gp from this M
    lock(&sched.lock)
    var _p_ *p
    // Only try once more for an idle P when schedEnabled reports that
    // gp may currently be scheduled.
    if schedEnabled(gp) {
        _p_, _ = pidleget(0)
    }
    var locked bool
    if _p_ == nil {
        globrunqput(gp) // put gp on the global run queue

        // Below, we stoplockedm if gp is locked. globrunqput releases
        // ownership of gp, so we must check if gp is locked prior to
        // committing the release by unlocking sched.lock, otherwise we
        // could race with another M transitioning gp from unlocked to
        // locked.
        locked = gp.lockedm != 0
    } else if atomic.Load(&sched.sysmonwait) != 0 { // we got a P: wake sysmon if it is waiting
        atomic.Store(&sched.sysmonwait, 0)
        notewakeup(&sched.sysmonnote)
    }
    unlock(&sched.lock)
    if _p_ != nil {
        acquirep(_p_) // attach the P to this M
        // Run gp on it.
        execute(gp, false) // Never returns.
    }
    if locked {
        // Wait until another thread schedules gp and so m again.
        //
        // N.B. lockedm must be this M, as this g was running on this M
        // before entersyscall.
        stoplockedm()
        execute(gp, false) // Never returns.
    }
    stopm() // park this worker thread until it is woken up with a P
    // Re-enter the scheduling loop.
    schedule() // Never returns.
}

信号形式发送抢占

preemptM()

preemptM向mp发送抢占请求。该请求可以异步处理，并且可以与对M的其他请求合并。
当接收到请求时，如果正在运行的G或P被标记为抢占，并且goroutine处于异步安全点，则它将抢占 goroutine。
它总是在处理抢占请求后自动递增mp.preemptGen。
通过runtime.signalM()函数向执行M发送sigPreempt信号。
至于signalM()函数，就是调用操作系统的信号相关系统调用，将指定信号发送给目标线程。
至此，异步抢占逻辑的主要工作就算完成了前一半。
preemptM这个函数会调用signalM，将初始化时已安装处理函数的_SIGURG信号发送到指定的M上。
使用 preemptM 发送抢占信号的地方主要有下面几个：
1. Go 后台监控 runtime.sysmon 检测超时发送抢占信号；
2. Go GC 栈扫描发送抢占信号；
3. Go GC STW 的时候调用 preemptall 抢占所有 P，让其暂停；
文件位置：go1.19.3/src/runtime/signal_unix.go。
参数mp *m：被抢占的P关联的M。

362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398


// preemptM sends a preemption request to mp. This request may be
// handled asynchronously and may be coalesced with other requests to
// the M. When the request is received, if the running G or P are
// marked for preemption and the goroutine is at an asynchronous
// safe-point, it will preempt the goroutine. It always atomically
// increments mp.preemptGen after handling a preemption request.
func preemptM(mp *m) {
    // On Darwin, don't try to preempt threads during exec.
    // Issue #41702.
    if GOOS == "darwin" || GOOS == "ios" {
        execLock.rlock()
    }

    // mp.signalPending records whether a preemption signal is already
    // pending on this M; the CAS lets only one sender through.
    if atomic.Cas(&mp.signalPending, 0, 1) {
        if GOOS == "darwin" || GOOS == "ios" {
            atomic.Xadd(&pendingPreemptSignals, 1)
        }

        // If multiple threads are preempting the same M, it may send many
        // signals to the same M such that it hardly make progress, causing
        // live-lock problem. Apparently this could happen on darwin. See
        // issue #37741.
        // Only send a signal if there isn't already one pending.
        //
        // const sigPreempt int = _SIGURG
        // const _SIGURG = 0x17
        signalM(mp, sigPreempt)
    }

    if GOOS == "darwin" || GOOS == "ios" {
        execLock.runlock()
    }
}

signalM()

signalM向mp发送信号。
文件位置：go1.19.3/src/runtime/os_linux.go。

551
552
553
554
555
556
557
558
559
560
561
562


// signalM sends the signal sig to the thread backing mp.
//
// It uses tgkill, whose C prototype is
//
//	int tgkill(int tgid, int tid, int sig);
//
// where tgid is the thread-group (process) ID, tid is the target thread
// ID and sig is the signal number. Passing tgid guards against the
// signal reaching an unrelated thread if tid has exited and its ID was
// reused by another process. For preemption, sig is
// sigPreempt = _SIGURG = 0x17.
func signalM(mp *m, sig int) {
    tgid := getpid()
    tid := int(mp.procid)
    tgkill(tgid, tid, sig)
}

tgkill()

系统调用 tgkill() 函数向进程内的线程发送信号。
文件位置：go1.19.3/src/runtime/sys_linux_amd64.s。

176
177
178
179
180
181
182


// tgkill loads tgid, tid and sig into DI, SI and DX, puts SYS_tgkill
// in AX and issues the system call.
TEXT ·tgkill(SB),NOSPLIT,$0
    MOVQ    tgid+0(FP), DI
    MOVQ    tid+8(FP), SI
    MOVQ    sig+16(FP), DX
    MOVL    $SYS_tgkill, AX
    SYSCALL // enter the kernel
    RET

全局信号处理注册

mstart1()

主线程启动运行到mstart()->mstart0()->mstart1()函数内时。
文件位置：go1.19.3/src/runtime/proc.go。

1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412


// The go:noinline is to guarantee the getcallerpc/getcallersp below are safe,
// so that we can set up g0.sched to return to the call of mstart1 above.
//
// NOTE: this is an abridged excerpt; the "// ..." markers stand for
// code that has been left out.
//
//go:noinline
func mstart1() {
     // ...
    
    asminit()
    minit()
    
    // Install signal handlers; after minit so that minit can
    // prepare the thread to be able to handle the signals.
    // Only the M that is m0 runs mstartm0.
    if gp.m == &m0 {
        mstartm0()
    }
    // ...
}

mstartm0()

initsig(false)则是注册信号处理相关。
文件位置：go1.19.3/src/runtime/proc.go。

1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450


// mstartm0 implements part of mstart1 that only runs on the m0.
//
// Write barriers are allowed here because we know the GC can't be
// running yet, so they'll be no-ops.
//
//go:yeswritebarrierrec
func mstartm0() {
    // Create an extra M for callbacks on threads not created by Go.
    // An extra M is also needed on Windows for callbacks created by
    // syscall.NewCallback. See issue #6751 for details.
    if (iscgo || GOOS == "windows") && !cgoHasExtraM {
        cgoHasExtraM = true
        newextram()
    }
    initsig(false)
}

initsig()

信号注册。
文件位置：go1.19.3/src/runtime/signal_unix.go。

109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158


// Initialize signals.
// Called by libpreinit so runtime may not be initialized.
//
//go:nosplit
//go:nowritebarrierrec
func initsig(preinit bool) {
    if !preinit {
        // It's now OK for signal handlers to run.
        signalsOK = true
    }

    // For c-archive/c-shared this is called by libpreinit with
    // preinit == true.
    if (isarchive || islibrary) && !preinit {
        return
    }

    // Walk every signal number.
    // const _NSIG int = 65;
    for i := uint32(0); i < _NSIG; i++ {
        // sigtable is the global table describing each signal.
        t := &sigtable[i]
        // const _SigDefault int = 16;
        // Skip signals that have no flags or have _SigDefault set: a
        // signal that was not explicitly requested is not watched.
        // Per the original notes these are SIGKILL, SIGSTOP, SIGTSTP,
        // SIGCONT, SIGTTIN and SIGTTOU -- TODO confirm against sigtable.
        if t.flags == 0 || t.flags&_SigDefault != 0 {
            continue
        }

        // We don't need to use atomic operations here because
        // there shouldn't be any other goroutines running yet.
        fwdSig[i] = getsig(i)

        if !sigInstallGoHandler(i) {
            // Even if we are not installing a signal handler,
            // set SA_ONSTACK if necessary.
            if fwdSig[i] != _SIG_DFL && fwdSig[i] != _SIG_IGN {
                setsigstack(i)
            } else if fwdSig[i] == _SIG_IGN {
                sigInitIgnored(i)
            }
            continue
        }

        // Mark the signal as handled by Go and install sighandler for it.
        handlingSig[i] = 1
        setsig(i, abi.FuncPCABIInternal(sighandler))
    }
}

setsig()

这里需要注意的是，当 fn 等于 sighandler 的时候，调用的函数会被替换成 sigtramp。
sigaction 函数在 Linux 下最终通过 rt_sigaction 系统调用完成信号处理函数的安装。
文件位置：go1.19.3/src/runtime/os_linux.go。

109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131


// setsig installs fn as the handler for signal i via sigaction. When fn
// is sighandler, the handler actually installed is cgoSigtramp (if
// iscgo) or sigtramp.
//
//go:nosplit
//go:nowritebarrierrec
func setsig(i uint32, fn uintptr) {
    var sa sigactiont
    sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK | _SA_RESTORER | _SA_RESTART
    sigfillset(&sa.sa_mask)
    // Although Linux manpage says "sa_restorer element is obsolete and
    // should not be used". x86_64 kernel requires it. Only use it on
    // x86.
    if GOARCH == "386" || GOARCH == "amd64" {
        sa.sa_restorer = abi.FuncPCABI0(sigreturn)
    }
    if fn == abi.FuncPCABIInternal(sighandler) { // abi.FuncPCABIInternal(sighandler) matches the callers in signal_unix.go
        if iscgo {
            fn = abi.FuncPCABI0(cgoSigtramp)
        } else {
            // Install sigtramp in place of sighandler.
            fn = abi.FuncPCABI0(sigtramp)
        }
    }
    sa.sa_handler = fn
    sigaction(i, &sa, nil)
}

信号形式响应抢占

sigtramp()

函数原型：func sigtramp()。
sigtramp()实际上是真正的信号处理函数，进程从内核态收到信号回到用户态调用的处理函数就是它。
注释中表明这个函数以C语言的调用惯例被调用，Go在这里通过PUSH_REGS_HOST_TO_ABI0保存go自己调用惯例用的寄存器后，
转换成自己的调用规范，等函数调用完毕之后，再通过POP_REGS_HOST_TO_ABI0恢复这些寄存器的值。
调度路径sigtramp()->sigtrampgo()->sighandler()->doSigPreempt()。
该函数被调用，说明信号已经送达当前线程，接下来由runtime·sigtramp负责处理该信号。
runtime·sigtramp会继续调用runtime·sigtrampgo。
文件位置：go1.19.3/src/runtime/sys_linux_amd64.s。

341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367


// sigtramp is the signal handler entry point. It saves the host-ABI
// registers, sets up the ABIInternal environment, forwards
// (sig, info, ctx) to sigtrampgo and restores the registers on return.
//
// Called using C ABI.
TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME,$0
    // Transition from C ABI to Go ABI.
    PUSH_REGS_HOST_TO_ABI0()

    // Set up ABIInternal environment: g in R14, cleared X15.
    get_tls(R12)        // TLS
    MOVQ    g(R12), R14 // R14 = g
    PXOR    X15, X15

    // Reserve space for spill slots.
    NOP SP  // disable vet stack checking
    ADJSP   $24

    // Call into the Go signal handler.
    //
    // DI, SI and DX hold the three handler arguments (sig, info, ctx)
    // as set up by the kernel when it invoked the handler; move them
    // into the ABIInternal argument registers AX, BX and CX.
    MOVQ    DI, AX	// sig
    MOVQ    SI, BX	// info
    MOVQ    DX, CX	// ctx
    CALL    ·sigtrampgo<ABIInternal>(SB)

    ADJSP   $-24

    POP_REGS_HOST_TO_ABI0()
    RET

sigtrampgo()

文件位置：go1.19.3/src/runtime/signal_unix.go。

420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484


// sigtrampgo is called from the signal handler function, sigtramp,
// written in assembly code.
// This is called by the signal handler, and the world may be stopped.
//
// sig is the signal number; info and ctx are the siginfo and context
// pointers delivered with it. The signal is first offered to sigfwdgo;
// if it is not forwarded, the function switches to the gsignal
// goroutine, runs sighandler and then restores the interrupted g.
//
// It must be nosplit because getg() is still the G that was running
// (if any) when the signal was delivered, but it's (usually) called
// on the gsignal stack. Until this switches the G to gsignal, the
// stack bounds check won't work.
//
//go:nosplit
//go:nowritebarrierrec
func sigtrampgo(sig uint32, info *siginfo, ctx unsafe.Pointer) {
    if sigfwdgo(sig, info, ctx) {
        return
    }
    c := &sigctxt{info, ctx}
    gp := sigFetchG(c) // the g that was running when the signal arrived, if any
    setg(gp)
    if gp == nil {
        if sig == _SIGPROF {
            // Some platforms (Linux) have per-thread timers, which we use in
            // combination with the process-wide timer. Avoid double-counting.
            if validSIGPROF(nil, c) {
                sigprofNonGoPC(c.sigpc())
            }
            return
        }
        if sig == sigPreempt && preemptMSupported && debug.asyncpreemptoff == 0 {
            // This is probably a signal from preemptM sent
            // while executing Go code but received while
            // executing non-Go code.
            // We got past sigfwdgo, so we know that there is
            // no non-Go signal handler for sigPreempt.
            // The default behavior for sigPreempt is to ignore
            // the signal, so badsignal will be a no-op anyway.
            if GOOS == "darwin" || GOOS == "ios" {
                // Balance the increment done in preemptM, using the same
                // atomic.Xadd form as preemptM and doSigPreempt.
                atomic.Xadd(&pendingPreemptSignals, -1)
            }
            return
        }
        c.fixsigcode(sig)
        badsignal(uintptr(sig), c)
        return
    }

    // From here on run as the M's gsignal goroutine.
    setg(gp.m.gsignal)

    // If some non-Go code called sigaltstack, adjust.
    var gsignalStack gsignalStack
    setStack := adjustSignalStack(sig, gp.m, &gsignalStack)
    if setStack {
        gp.m.gsignal.stktopsp = getcallersp()
    }

    if gp.stackguard0 == stackFork {
        signalDuringFork(sig)
    }

    c.fixsigcode(sig)
    sighandler(sig, info, ctx, gp)
    // Switch back to the interrupted g and undo any signal-stack adjustment.
    setg(gp)
    if setStack {
        restoreGsignalStack(&gsignalStack)
    }
}

sighandler()

响应抢占。调度路径sigtramp()->sigtrampgo()->sighandler()->doSigPreempt()。
文件位置：go1.19.3/src/runtime/signal_unix.go。

597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624


// sighandler is invoked when a signal occurs. The global g will be
// set to a gsignal goroutine and we will be running on the alternate
// signal stack. The parameter g will be the value of the global g
// when the signal occurred. The sig, info, and ctxt parameters are
// from the system signal handler: they are the parameters passed when
// the SA is passed to the sigaction system call.
//
// The garbage collector may have stopped the world, so write barriers
// are not allowed.
//
// NOTE: this is an abridged excerpt; the "// ... ..." markers stand for
// code that has been left out.
//
//go:nowritebarrierrec
func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
    // ... ...
        // sig == sigPreempt: this is the preemption signal.
        // debug.asyncpreemptoff == 0: asynchronous preemption is not disabled.
        // !delayedSignal: presumably "not a delayed signal" -- the flag is
        // set in the elided code, TODO confirm its exact meaning.
        if sig == sigPreempt && debug.asyncpreemptoff == 0 && !delayedSignal {
            // Might be a preemption signal.
            doSigPreempt(gp, c)
            // Even if this was definitely a preemption signal, it
            // may have been coalesced with another signal, so we
            // still let it through to the application.
        }
    
    // ... ...
}

doSigPreempt()

doSigPreempt处理gp上的抢占信号。
调用到doSigPreempt时，会将ctx这个参数传入，其中包含了进程用户态硬件上下文
ctxt的类型为*sigctxt，指向的是用户态堆栈中存放内核态堆栈内容的地址。
然后信号处理程序通过isAsyncSafePoint来判断抢占位置是否安全，并返回安全的抢占地址。
如果确认抢占没有问题，接着会调用pushCall方法来修改ctxt中的用户态硬件上下文，
用于稍后再一次从内核态返回用户态时模拟出一个用户态程序调用asyncPreempt的假象。
文件位置：go1.19.3/src/runtime/signal_unix.go。

340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364


// doSigPreempt handles a preemption signal on gp.
//
// ctxt is the signal context of the interrupted code. If gp wants to be
// preempted and the interrupted PC is an async safe point, a call to
// asyncPreempt is injected into that context. The preemption is always
// acknowledged afterwards, whether or not a call was injected.
func doSigPreempt(gp *g, ctxt *sigctxt) {
    // Check if this G wants to be preempted and is safe to
    // preempt.
    // wantAsyncPreempt confirms that an asynchronous preemption has
    // actually been requested for gp.
    if wantAsyncPreempt(gp) {
        // isAsyncSafePoint confirms that gp's current execution context
        // can safely be preempted asynchronously.
        if ok, newpc := isAsyncSafePoint(gp, ctxt.sigpc(), ctxt.sigsp(), ctxt.siglr()); ok {
            // Adjust the PC and inject a call to asyncPreempt.
            // Per the accompanying notes, asyncPreempt is an assembly
            // routine that spills the registers to the stack and then
            // calls asyncPreempt2.
            ctxt.pushCall(abi.FuncPCABI0(asyncPreempt), newpc) // make the interrupted goroutine call asyncPreempt when it resumes
        }
    }

    // Acknowledge the preemption.
    atomic.Xadd(&gp.m.preemptGen, 1)
    atomic.Store(&gp.m.signalPending, 0)

    if GOOS == "darwin" || GOOS == "ios" {
        atomic.Xadd(&pendingPreemptSignals, -1)
    }
}

wantAsyncPreempt()

wantAsyncPreempt返回是否有针对gp的异步抢占请求正在排队等待处理。
文件位置：go1.19.3/src/runtime/preempt.go。

340
341
342
343
344
345
346
347
348
349
350


// wantAsyncPreempt reports whether an asynchronous preemption is queued
// for gp: the preempt flag must be set on gp itself or on the P of gp's
// M (when it has one), and gp's status, ignoring the _Gscan bit, must be
// _Grunning.
func wantAsyncPreempt(gp *g) bool {
    // Check both the G and the P for a preemption request.
    requested := gp.preempt
    if !requested && gp.m.p != 0 {
        requested = gp.m.p.ptr().preempt
    }
    if !requested {
        return false
    }
    // Only a goroutine that is currently running can be preempted.
    return readgstatus(gp)&^_Gscan == _Grunning
}

isAsyncSafePoint()

它从以下几个方面来保证在当前位置进行异步抢占是安全的。
1. 可以挂起G并安全的扫描它的栈和寄存器，没有潜在的隐藏指针，而且当前并没有打断一个写屏障。
2. G还有足够的栈空间来注入一个对asyncPreempt()函数的调用。
3. 可以安全地和 runtime 进行交互，例如未持有 runtime 相关的锁，因此在尝试获得锁时不会造成死锁。
isAsyncSafePoint报告指令PC上的gp是否是异步安全点。这表明：
1. 暂停gp并保守地扫描它的堆栈和寄存器是安全的。它没有潜在的隐藏指针值，也不像写屏障那样位于原子序列的中间。
2. gp有足够的堆栈空间注入asyncPreempt调用。
3. 通常情况下，与运行时交互是安全的，即使我们在信号处理程序中就停在这里。例如，没有持有运行时锁，因此获取运行时锁不会自死锁。
在某些情况下，PC是安全的异步抢占，但它也需要调整恢复PC。新的PC在第二个结果中返回。
文件位置：go1.19.3/src/runtime/preempt.go。

347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460


// isAsyncSafePoint reports whether gp at instruction PC is an
// asynchronous safe point. This indicates that:
//
// 1. It's safe to suspend gp and conservatively scan its stack and
// registers. There are no potentially hidden pointer values and it's
// not in the middle of an atomic sequence like a write barrier.
//
// 2. gp has enough stack space to inject the asyncPreempt call.
//
// 3. It's generally safe to interact with the runtime, even if we're
// in a signal handler stopped here. For example, there are no runtime
// locks held, so acquiring a runtime lock won't self-deadlock.
//
// In some cases the PC is safe for asynchronous preemption but it
// also needs to adjust the resumption PC. The new PC is returned in
// the second result.
//
// pc, sp and lr are the register values for the point being examined;
// lr is only consulted on the MIPS architectures. Whenever the first
// result is false, the second result is 0.
func isAsyncSafePoint(gp *g, pc, sp, lr uintptr) (bool, uintptr) {
    mp := gp.m

    // Only user Gs can have safe-points. We check this first
    // because it's extremely common that we'll catch mp in the
    // scheduler processing this G preemption.
    if mp.curg != gp {
        return false, 0
    }

    // Check M state.
    // The M must have a P attached and canPreemptM must allow preemption.
    // NOTE: canPreemptM is defined elsewhere; the article's author records it as
    //   mp.locks == 0 && mp.mallocing == 0 && mp.preemptoff == "" && mp.p.ptr().status == _Prunning
    if mp.p == 0 || !canPreemptM(mp) {
        return false, 0
    }

    // Check stack space.
    // asyncPreemptStack is the number of bytes of stack needed to inject
    // an asyncPreempt call. Also reject an sp that lies below the stack's
    // low bound, so the unsigned subtraction cannot wrap around.
    if sp < gp.stack.lo || sp-gp.stack.lo < asyncPreemptStack {
        return false, 0
    }

    // Check if PC is an unsafe-point.
    f := findfunc(pc)
    if !f.valid() {
        // Not Go code.
        return false, 0
    }
    if (GOARCH == "mips" || GOARCH == "mipsle" || GOARCH == "mips64" || GOARCH == "mips64le") && lr == pc+8 && funcspdelta(f, pc, nil) == 0 {
        // We probably stopped at a half-executed CALL instruction,
        // where the LR is updated but the PC has not. If we preempt
        // here we'll see a seemingly self-recursive call, which is in
        // fact not.
        // This is normally ok, as we use the return address saved on
        // stack for unwinding, not the LR value. But if this is a
        // call to morestack, we haven't created the frame, and we'll
        // use the LR for unwinding, which will be bad.
        return false, 0
    }
    // up is the unsafe-point PCDATA value at pc; startpc is only used
    // below, for the restartable-sequence cases.
    up, startpc := pcdatavalue2(f, _PCDATA_UnsafePoint, pc)
    if up == _PCDATA_UnsafePointUnsafe {
        // Unsafe-point marked by compiler. This includes
        // atomic sequences (e.g., write barrier) and nosplit
        // functions (except at calls).
        return false, 0
    }
    if fd := funcdata(f, _FUNCDATA_LocalsPointerMaps); fd == nil || f.flag&funcFlag_ASM != 0 {
        // This is assembly code. Don't assume it's well-formed.
        // TODO: Empirically we still need the fd == nil check. Why?
        //
        // TODO: Are there cases that are safe but don't have a
        // locals pointer map, like empty frame functions?
        // It might be possible to preempt any assembly functions
        // except the ones that have funcFlag_SPWRITE set in f.flag.
        return false, 0
    }
    // Determine the function name to test against the prefix list below.
    // When f has an inline tree and pc maps to an entry in it, use the
    // name recorded for that entry instead of f's own name.
    name := funcname(f)
    if inldata := funcdata(f, _FUNCDATA_InlTree); inldata != nil {
        inltree := (*[1 << 20]inlinedCall)(inldata)
        ix := pcdatavalue(f, _PCDATA_InlTreeIndex, pc, nil)
        if ix >= 0 {
            name = funcnameFromNameoff(f, inltree[ix].func_)
        }
    }
    if hasPrefix(name, "runtime.") ||
        hasPrefix(name, "runtime/internal/") ||
        hasPrefix(name, "reflect.") {
        // For now we never async preempt the runtime or
        // anything closely tied to the runtime. Known issues
        // include: various points in the scheduler ("don't
        // preempt between here and here"), much of the defer
        // implementation (untyped info on stack), bulk write
        // barriers (write barrier check),
        // reflect.{makeFuncStub,methodValueCall}.
        //
        // TODO(austin): We should improve this, or opt things
        // in incrementally.
        return false, 0
    }
    // The point is safe. Decide which PC execution should resume at.
    switch up {
    case _PCDATA_Restart1, _PCDATA_Restart2:
        // Restartable instruction sequence. Back off PC to
        // the start PC.
        // Sanity check: startpc must be set, must not be after pc, and
        // must be at most 20 bytes before it.
        if startpc == 0 || startpc > pc || pc-startpc > 20 {
            throw("bad restart PC")
        }
        return true, startpc
    case _PCDATA_RestartAtEntry:
        // Restart from the function entry at resumption.
        return true, f.entry()
    }
    // Any other value: resume exactly at pc.
    return true, pc
}

pushCall()

pushCall干了两件事：
- 修改程序计数器的指向为asyncPreempt函数的地址。
- 修改栈顶指针，将当前 goroutine 的原本中断地址放入堆栈。
文件位置：go1.19.3/src/runtime/signal_amd64.go。
先把SP向下移动一个指针大小的位置，把PC的值存入栈上SP指向的位置，然后将PC的值更新为targetPC。
这样就模拟了一条CALL指令的效果，栈上存入的PC的旧值就相当于返回地址。
此时整个执行上下文的状态就像是goroutine在被信号打断的位置额外执行了一条CALL targetPC指令。
由于执行流程刚刚跳转到targetPC地址处，所以还没来得及执行目标地址处的指令。
当sighandler()函数处理完信号并返回后，被打断的goroutine得以继续执行，会立即调用被注入的asyncPreempt()函数。经过一连串的函数调用，最终执行到schedule()函数。
参数：
- targetPC uintptr：asyncPreempt 函数的执行入口地址。
- resumePC uintptr：其实就是发生中断前当前goroutine的下一指令地址，也就是PC的值。

（以下代码对应源文件第80–89行。）


// pushCall rewrites the saved signal context so that it looks as if the
// interrupted code had just executed a CALL to targetPC with resumePC as
// the return address.
//
// It stores resumePC in a new pointer-sized slot directly below the saved
// stack pointer, then updates the saved SP to point at that slot and the
// saved PC to targetPC.
func (c *sigctxt) pushCall(targetPC, resumePC uintptr) {
    // Address of the new top-of-stack slot: one pointer below the saved SP.
    retSlot := uintptr(c.rsp()) - goarch.PtrSize

    // Fake the return address that a real CALL would have pushed.
    *(*uintptr)(unsafe.Pointer(retSlot)) = resumePC

    // Patch the saved context: these are the SP and PC values that
    // pushCall leaves behind in c.
    c.set_rsp(uint64(retSlot))
    c.set_rip(uint64(targetPC))
}

asyncPreempt()

信号处理函数返回后，被打断的goroutine恢复运行，并从被注入的本函数开始执行。
文件位置：go1.19.3/src/runtime/preempt_amd64.s。

（以下代码对应源文件第6–86行。）


// asyncPreempt saves BP, the flags register, the 14 general-purpose
// registers AX..R15 (SP and BP are handled separately) and X0..X15 on
// the stack, calls asyncPreempt2, then restores everything in reverse
// order and returns.
//
// Frame layout after ADJSP $368:
//   0..111(SP)    AX, CX, DX, BX, SI, DI, R8..R15 (8 bytes each)
//   112..367(SP)  X0..X15 (16 bytes each)
TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
    PUSHQ BP        // push the caller's BP
    MOVQ SP, BP     // BP = SP
    // Save flags before clobbering them
    PUSHFQ
    // obj doesn't understand ADD/SUB on SP, but does understand ADJSP
    ADJSP $368
    // But vet doesn't know ADJSP, so suppress vet stack checking
    NOP SP
    // Save the general-purpose registers.
    MOVQ AX, 0(SP)
    MOVQ CX, 8(SP)
    MOVQ DX, 16(SP)
    MOVQ BX, 24(SP)
    MOVQ SI, 32(SP)
    MOVQ DI, 40(SP)
    MOVQ R8, 48(SP)
    MOVQ R9, 56(SP)
    MOVQ R10, 64(SP)
    MOVQ R11, 72(SP)
    MOVQ R12, 80(SP)
    MOVQ R13, 88(SP)
    MOVQ R14, 96(SP)
    MOVQ R15, 104(SP)
    // On darwin only: execute VZEROUPPER when the CPU reports AVX
    // (the JE skips it when the HasAVX flag is zero).
    #ifdef GOOS_darwin
    CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0
    JE 2(PC)
    VZEROUPPER
    #endif
    // Save the X registers.
    MOVUPS X0, 112(SP)
    MOVUPS X1, 128(SP)
    MOVUPS X2, 144(SP)
    MOVUPS X3, 160(SP)
    MOVUPS X4, 176(SP)
    MOVUPS X5, 192(SP)
    MOVUPS X6, 208(SP)
    MOVUPS X7, 224(SP)
    MOVUPS X8, 240(SP)
    MOVUPS X9, 256(SP)
    MOVUPS X10, 272(SP)
    MOVUPS X11, 288(SP)
    MOVUPS X12, 304(SP)
    MOVUPS X13, 320(SP)
    MOVUPS X14, 336(SP)
    MOVUPS X15, 352(SP)
    CALL ·asyncPreempt2(SB) // hand control to asyncPreempt2

    // asyncPreempt2 returns here once the goroutine is running again;
    // restore the saved state in reverse order.
    MOVUPS 352(SP), X15
    MOVUPS 336(SP), X14
    MOVUPS 320(SP), X13
    MOVUPS 304(SP), X12
    MOVUPS 288(SP), X11
    MOVUPS 272(SP), X10
    MOVUPS 256(SP), X9
    MOVUPS 240(SP), X8
    MOVUPS 224(SP), X7
    MOVUPS 208(SP), X6
    MOVUPS 192(SP), X5
    MOVUPS 176(SP), X4
    MOVUPS 160(SP), X3
    MOVUPS 144(SP), X2
    MOVUPS 128(SP), X1
    MOVUPS 112(SP), X0
    MOVQ 104(SP), R15
    MOVQ 96(SP), R14
    MOVQ 88(SP), R13
    MOVQ 80(SP), R12
    MOVQ 72(SP), R11
    MOVQ 64(SP), R10
    MOVQ 56(SP), R9
    MOVQ 48(SP), R8
    MOVQ 40(SP), DI
    MOVQ 32(SP), SI
    MOVQ 24(SP), BX
    MOVQ 16(SP), DX
    MOVQ 8(SP), CX
    MOVQ 0(SP), AX
    ADJSP $-368
    POPFQ
    POPQ BP
    RET // return to the address on top of the stack and continue the original goroutine code

asyncPreempt2()

文件位置：go1.19.3/src/runtime/preempt.go。

（以下代码对应源文件第301–314行。）


// asyncPreempt2 is the Go half of asynchronous preemption, called from
// the asyncPreempt assembly stub. It marks the current goroutine as being
// at an async safe point, switches to the scheduler through mcall, and
// clears the mark again once the goroutine resumes.
//
//go:nosplit
func asyncPreempt2() {
    gp := getg()
    // Flag is set only for the duration of the mcall below.
    gp.asyncSafePoint = true
    // preemptStop is mainly used during GC marking to suspend a running
    // goroutine (per the article's author).
    if gp.preemptStop {
        // preemptPark is defined elsewhere; it is described as moving the
        // current g to _Gpreempted and then calling schedule.
        mcall(preemptPark)
    } else {
        // Otherwise yield through gopreempt_m, which calls goschedImpl and
        // so ends up in schedule as well. The article attributes this path
        // to preemption requested via preemptone.
        mcall(gopreempt_m)
    }
    gp.asyncSafePoint = false
}

gopreempt_m()

文件位置：go1.19.3/src/runtime/proc.go。

（以下代码对应源文件第3402–3407行。）


// gopreempt_m yields the preempted goroutine gp: it calls traceGoPreempt
// when trace.enabled is set and then hands gp to goschedImpl.
func gopreempt_m(gp *g) {
    if trace.enabled {
        // Tracing hook; runs before gp is descheduled by goschedImpl.
        traceGoPreempt()
    }
    goschedImpl(gp)
}

goschedImpl()

将G的状态由_Grunning改为_Grunnable，解除G与M的绑定关系，把G放入全局运行队列，然后再次进入调度循环。
文件位置：go1.19.3/src/runtime/proc.go。

（以下代码对应源文件第3366–3380行。）


// goschedImpl takes the running goroutine gp off its M, marks it
// runnable, places it on the global run queue and re-enters the
// scheduler.
//
// It throws "bad g status" if gp, ignoring the _Gscan bit, is not in
// _Grunning.
func goschedImpl(gp *g) {
    // Only a running goroutine may be yielded; mask off the scan bit
    // before comparing.
    if st := readgstatus(gp) &^ _Gscan; st != _Grunning {
        dumpgstatus(gp)
        throw("bad g status")
    }

    // _Grunning -> _Grunnable, then break the association set up while
    // gp was running.
    casgstatus(gp, _Grunning, _Grunnable)
    dropg()

    // globrunqput is called with sched.lock held.
    lock(&sched.lock)
    globrunqput(gp)
    unlock(&sched.lock)

    // Start another round of scheduling.
    schedule()
}

抢占标识#

retake()#

incidlelocked()#

preemptone()#

handoffp()#

响应抢占请求#

morestack_noctxt()#

morestack()#

newstack()#

canPreemptM#

gopreempt_m#

系统调用前后#

系统调用#

Syscall6()#

系统调用前#

entersyscall()#

reentersyscall()#

save()#

系统调用后#

exitsyscall()#

exitsyscallfast()#

exitsyscallfast_pidle()#

exitsyscall0()#

信号形式发送抢占#

preemptM()#

signalM()#

tgkill()#

全局信号处理注册#

mstart1()#

mstartm0()#

initsig()#

setsig()#

信号形式响应抢占#

sigtramp()#

sigtrampgo()#

sighandler()#

doSigPreempt()#

wantAsyncPreempt()#

isAsyncSafePoint()#

pushCall()#

asyncPreempt()#

asyncPreempt2()#

gopreempt_m()#

goschedImpl()#

抢占标识

retake()

incidlelocked()

preemptone()

handoffp()

响应抢占请求

morestack_noctxt()

morestack()

newstack()

canPreemptM

gopreempt_m

系统调用前后

系统调用

Syscall6()

系统调用前

entersyscall()

reentersyscall()

save()

系统调用后

exitsyscall()

exitsyscallfast()

exitsyscallfast_pidle()

exitsyscall0()

信号形式发送抢占

preemptM()

signalM()

tgkill()

全局信号处理注册

mstart1()

mstartm0()

initsig()

setsig()

信号形式响应抢占

sigtramp()

sigtrampgo()

sighandler()

doSigPreempt()

wantAsyncPreempt()

isAsyncSafePoint()

pushCall()

asyncPreempt()

asyncPreempt2()

gopreempt_m()

goschedImpl()