Defer performance improvements in Golang

Golang 1.14 adds a new open-coded defer type: during the SSA pass, the compiler inserts the deferred call directly at the tail of the function, avoiding the runtime deferproc and deferprocStack operations.

When no runtime check is required, the deferreturn call is avoided entirely. Where runtime checks remain, deferreturn is further optimized as well: under open coding it no longer makes the jmpdefer tail call, but simply walks the deferred calls in a loop and executes them.

As of 1.14 there are three defer modes in the implementation; after compilation, a single function uses only one of them.

Heap allocation

In versions of Golang before 1.13, every defer was allocated on the heap (deferproc). At compile time this mechanism involves two steps:

  1. A call to runtime.deferproc is inserted at the site of the defer statement. When it executes, the deferred call is saved as a _defer record: the entry address of the deferred function and its arguments are copied into the record, which is then pushed onto the Goroutine's defer linked list.

  2. A call to runtime.deferreturn is inserted just before the function returns. When it executes, it takes the deferred call off the Goroutine's list and runs it; multiple deferred calls are executed one after another via jmpdefer tail calls.

The main performance cost of this mechanism is the memory allocation performed for every defer statement's record, plus the overhead of copying the arguments into the record and moving them again when the deferred call is finally made.
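
As a rough illustration (not real compiler output), the rewrite for a heap-allocated defer can be pictured as follows; the runtime calls in the comments only sketch where deferproc and deferreturn are inserted, and closeConn/handle are invented names:

```go
package main

import "fmt"

// closeConn is a stand-in deferred function used only for illustration.
func closeConn(id int) { fmt.Println("close", id) }

// What we write:
func handle(id int) {
	defer closeConn(id)
	fmt.Println("handle", id)
}

// Roughly what the pre-1.13 compiler arranges (pseudocode, not real signatures):
//
//	func handle(id int) {
//		// deferproc allocates a _defer record on the heap, copies the
//		// argument id into it, and links it onto the goroutine's defer list.
//		runtime.deferproc(siz, closeConn, id)
//		fmt.Println("handle", id)
//		// deferreturn pops the record, restores the saved arguments and
//		// jumps to closeConn via jmpdefer before returning.
//		runtime.deferreturn()
//	}

func main() { handle(1) }
```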

Stack allocation

Golang 1.13 added deferprocStack, which allocates the _defer record on the stack as a replacement for deferproc. Compared with the latter, the stack-allocated _defer is released when the function returns, eliminating the performance cost of the memory allocation; only the _defer linked list still has to be maintained.

The compiler chooses between deferproc and deferprocStack; in the common case it picks deferprocStack, which improves performance by roughly 30%. However, when the defer statement appears inside a loop, when higher-level compiler optimizations cannot be applied, or when a single function uses too many defers, deferproc is still used.
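
As a minimal sketch (the function names below are invented for illustration), a straight-line defer such as the one in copyFile can use deferprocStack, while the defer inside the loop in closeAll still falls back to heap allocation:

```go
package main

import "os"

// copyFile has a single, unconditional defer: the _defer record's lifetime
// matches the stack frame, so the compiler can allocate it on the stack
// (deferprocStack).
func copyFile(path string) error {
	f, err := os.Open(path)
	if err != nil {
		return err
	}
	defer f.Close()
	// ... read from f ...
	return nil
}

// closeAll defers inside a loop: the number of pending records is unknown at
// compile time, so each iteration's _defer record is still heap-allocated via
// deferproc.
func closeAll(paths []string) {
	for _, p := range paths {
		f, err := os.Open(p)
		if err != nil {
			continue
		}
		defer f.Close() // heap-allocated defer record
	}
}

func main() {
	_ = copyFile("/tmp/in.txt")
	closeAll([]string{"/tmp/a", "/tmp/b"})
}
```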

Stack allocation (deferprocStack) works much like the heap version; only the allocation strategy changes: the _defer record is kept in the function's own call stack frame, and the compiler reserves space for it during the SSA pass.

SSA stands for static single-assignment, an IR (intermediate representation) in which every variable is assigned exactly once. This property simplifies the compiler's optimization algorithms. Roughly speaking, the switch to SSA reduced binary sizes by about 30% and improved performance by 5%-35%.
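
As a tiny, hand-written illustration of the idea (not actual compiler output; add2mul3 is just a throwaway name), the function below is followed by an SSA-style rendering of its body in which every value is defined exactly once:

```go
package demo

// add2mul3 reassigns x twice in source form.
func add2mul3(x int) int {
	x = x + 2
	x = x * 3
	return x
}

// A simplified SSA-style rendering of the body:
//
//	v1 = Arg <int> {x}
//	v2 = Add v1, Const 2
//	v3 = Mul v2, Const 3
//	Ret v3
//
// The repeated assignments to x become distinct values v1..v3, each assigned
// exactly once, which makes rewrites such as constant folding purely local.
```

The real SSA for any function can be inspected by building with the GOSSAFUNC environment variable set to the function name, e.g. GOSSAFUNC=add2mul3 go build, which writes an ssa.html showing the function after every optimization pass.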

```go
// buildssa builds an SSA function for fn.
// worker indicates which of the backend workers is doing the processing.
func buildssa(fn *Node, worker int) *ssa.Func {
	name := fn.funcname()
	printssa := name == ssaDump
	var astBuf *bytes.Buffer
	if printssa {
		astBuf = &bytes.Buffer{}
		fdumplist(astBuf, "buildssa-enter", fn.Func.Enter)
		fdumplist(astBuf, "buildssa-body", fn.Nbody)
		fdumplist(astBuf, "buildssa-exit", fn.Func.Exit)
		if ssaDumpStdout {
			fmt.Println("generating SSA for", name)
			fmt.Print(astBuf.String())
		}
	}

	var s state
	s.pushLine(fn.Pos)
	defer s.popLine()

	s.hasdefer = fn.Func.HasDefer()
	if fn.Func.Pragma&CgoUnsafeArgs != 0 {
		s.cgoUnsafeArgs = true
	}

	fe := ssafn{
		curfn: fn,
		log:   printssa && ssaDumpStdout,
	}
	s.curfn = fn

	s.f = ssa.NewFunc(&fe)
	s.config = ssaConfig
	s.f.Type = fn.Type
	s.f.Config = ssaConfig
	s.f.Cache = &ssaCaches[worker]
	s.f.Cache.Reset()
	s.f.DebugTest = s.f.DebugHashMatch("GOSSAHASH", name)
	s.f.Name = name
	s.f.PrintOrHtmlSSA = printssa
	if fn.Func.Pragma&Nosplit != 0 {
		s.f.NoSplit = true
	}
	s.panics = map[funcLine]*ssa.Block{}
	s.softFloat = s.config.SoftFloat

	if printssa {
		s.f.HTMLWriter = ssa.NewHTMLWriter(ssaDumpFile, s.f.Frontend(), name, ssaDumpCFG)
		// TODO: generate and print a mapping from nodes to values and blocks
		dumpSourcesColumn(s.f.HTMLWriter, fn)
		s.f.HTMLWriter.WriteAST("AST", astBuf)
	}

	// Allocate starting block
	s.f.Entry = s.f.NewBlock(ssa.BlockPlain)

	// Allocate starting values
	s.labels = map[string]*ssaLabel{}
	s.labeledNodes = map[*Node]*ssaLabel{}
	s.fwdVars = map[*Node]*ssa.Value{}
	s.startmem = s.entryNewValue0(ssa.OpInitMem, types.TypeMem)

	s.hasOpenDefers = Debug['N'] == 0 && s.hasdefer && !s.curfn.Func.OpenCodedDeferDisallowed()
	switch {
	case s.hasOpenDefers && (Ctxt.Flag_shared || Ctxt.Flag_dynlink) && thearch.LinkArch.Name == "386":
		// Don't support open-coded defers for 386 ONLY when using shared
		// libraries, because there is extra code (added by rewriteToUseGot())
		// preceding the deferreturn/ret code that is generated by gencallret()
		// that we don't track correctly.
		s.hasOpenDefers = false
	}
	if s.hasOpenDefers && s.curfn.Func.Exit.Len() > 0 {
		// Skip doing open defers if there is any extra exit code (likely
		// copying heap-allocated return values or race detection), since
		// we will not generate that code in the case of the extra
		// deferreturn/ret segment.
		s.hasOpenDefers = false
	}
	if s.hasOpenDefers &&
		s.curfn.Func.numReturns*s.curfn.Func.numDefers > 15 {
		// Since we are generating defer calls at every exit for
		// open-coded defers, skip doing open-coded defers if there are
		// too many returns (especially if there are multiple defers).
		// Open-coded defers are most important for improving performance
		// for smaller functions (which don't have many returns).
		s.hasOpenDefers = false
	}

	s.sp = s.entryNewValue0(ssa.OpSP, types.Types[TUINTPTR]) // TODO: use generic pointer type (unsafe.Pointer?) instead
	s.sb = s.entryNewValue0(ssa.OpSB, types.Types[TUINTPTR])

	s.startBlock(s.f.Entry)
	s.vars[&memVar] = s.startmem
	if s.hasOpenDefers {
		// Create the deferBits variable and stack slot. deferBits is a
		// bitmask showing which of the open-coded defers in this function
		// have been activated.
		deferBitsTemp := tempAt(src.NoXPos, s.curfn, types.Types[TUINT8])
		s.deferBitsTemp = deferBitsTemp
		// For this value, AuxInt is initialized to zero by default
		startDeferBits := s.entryNewValue0(ssa.OpConst8, types.Types[TUINT8])
		s.vars[&deferBitsVar] = startDeferBits
		s.deferBitsAddr = s.addr(deferBitsTemp, false)
		s.store(types.Types[TUINT8], s.deferBitsAddr, startDeferBits)
		// Make sure that the deferBits stack slot is kept alive (for use
		// by panics) and stores to deferBits are not eliminated, even if
		// all checking code on deferBits in the function exit can be
		// eliminated, because the defer statements were all
		// unconditional.
		s.vars[&memVar] = s.newValue1Apos(ssa.OpVarLive, types.TypeMem, deferBitsTemp, s.mem(), false)
	}

	// Generate addresses of local declarations
	s.decladdrs = map[*Node]*ssa.Value{}
	for _, n := range fn.Func.Dcl {
		switch n.Class() {
		case PPARAM, PPARAMOUT:
			s.decladdrs[n] = s.entryNewValue2A(ssa.OpLocalAddr, types.NewPtr(n.Type), n, s.sp, s.startmem)
			if n.Class() == PPARAMOUT && s.canSSA(n) {
				// Save ssa-able PPARAMOUT variables so we can
				// store them back to the stack at the end of
				// the function.
				s.returns = append(s.returns, n)
			}
		case PAUTO:
			// processed at each use, to prevent Addr coming
			// before the decl.
		case PAUTOHEAP:
			// moved to heap - already handled by frontend
		case PFUNC:
			// local function - already handled by frontend
		default:
			s.Fatalf("local variable with class %v unimplemented", n.Class())
		}
	}

	// Populate SSAable arguments.
	for _, n := range fn.Func.Dcl {
		if n.Class() == PPARAM && s.canSSA(n) {
			v := s.newValue0A(ssa.OpArg, n.Type, n)
			s.vars[n] = v
			s.addNamedValue(n, v) // This helps with debugging information, not needed for compilation itself.
		}
	}

	// Convert the AST-based IR to the SSA-based IR
	s.stmtList(fn.Func.Enter)
	s.stmtList(fn.Nbody)

	// fallthrough to exit
	if s.curBlock != nil {
		s.pushLine(fn.Func.Endlineno)
		s.exit()
		s.popLine()
	}

	for _, b := range s.f.Blocks {
		if b.Pos != src.NoXPos {
			s.updateUnsetPredPos(b)
		}
	}

	s.insertPhis()

	// Main call to ssa package to compile function
	ssa.Compile(s.f)

	if s.hasOpenDefers {
		s.emitOpenDeferInfo()
	}

	return s.f
}
```

If, while building the SSA, the compiler finds that gcflags contains the N flag (optimizations disabled), or that the number of returns multiplied by the number of defers exceeds 15, the open-coded mode is not used.

In addition, escape analysis looks at the loop depth: a defer at the top level of the function (loop depth 1) is marked EscNever, forcing stack allocation of its defer record, while a defer inside a loop still escapes and keeps the heap-allocated record.

```go
// augmentParamHole augments parameter holes as necessary for use in
// go/defer statements.
func (e *Escape) augmentParamHole(k EscHole, call, where *Node) EscHole {
	k = k.note(call, "call parameter")
	if where == nil {
		return k
	}

	// Top level defers arguments don't escape to heap, but they
	// do need to last until end of function. Tee with a
	// non-transient location to avoid arguments from being
	// transiently allocated.
	if where.Op == ODEFER && e.loopDepth == 1 {
		// force stack allocation of defer record, unless open-coded
		// defers are used (see ssa.go)
		where.Esc = EscNever
		return e.later(k)
	}

	return e.heapHole().note(where, "call parameter")
}
```
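
An informal way to confirm which runtime call a given defer ends up using (just a debugging trick, with invented function names) is to dump the generated assembly, e.g. with go tool compile -S demo.go, and look for the runtime calls: on Go 1.13 the top-level defer below should roughly show a CALL to runtime.deferprocStack, the defer inside the loop a CALL to runtime.deferproc, and on Go 1.14 the top-level case is open-coded, so its deferproc call disappears:

```go
// demo.go
package demo

import "sync"

var mu sync.Mutex

// topLevel: single unconditional defer; stack-allocated record on Go 1.13,
// open-coded on Go 1.14.
func topLevel() {
	mu.Lock()
	defer mu.Unlock()
}

// inLoop: the defer sits inside a loop, so its record stays heap-allocated.
func inLoop(fns []func()) {
	for _, fn := range fns {
		defer fn()
	}
}
```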

Open coding

Golang 1.14 goes a step further with open coding (open-coded defers): the deferred calls are inserted directly before the function returns, removing the runtime deferproc / deferprocStack operations, and at run time deferreturn no longer performs a tail call either, instead iterating over all the deferred functions in a loop and executing them.

This makes the overhead of defer almost negligible; the only runtime cost is storing the information about the deferred calls. However, three conditions must be met to use this mechanism (a small example follows the list):

  1. Compiler optimizations are not disabled, i.e. -gcflags "-N" is not set.
  2. The function contains no more than 8 defers, and the product of the number of return statements and the number of defer statements does not exceed 15.
  3. The defer does not appear inside a loop.
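
As a minimal sketch (the names are invented for illustration), process below satisfies all three conditions and can be compiled with open-coded defers, while retryLoop cannot, because its defer sits inside a loop:

```go
package main

import (
	"fmt"
	"sync"
)

var mu sync.Mutex

// process: two defers, no loop; with optimizations enabled (no -gcflags "-N")
// the compiler can open-code both calls at the function's exit.
func process(verbose bool) {
	mu.Lock()
	defer mu.Unlock()
	if verbose {
		defer fmt.Println("process done")
	}
	// ... work ...
}

// retryLoop: the defer is inside a for loop, so open coding is disallowed and
// each iteration's defer record falls back to deferproc.
func retryLoop(n int) {
	for i := 0; i < n; i++ {
		defer fmt.Println("attempt", i)
	}
}

func main() {
	process(true)
	retryLoop(3)
}
```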

The mechanism also introduces a new element, the defer bits, which the runtime uses to record whether each defer has actually been reached (in particular, defers sitting in conditional branches), so that at the end of the function it is easy to decide which deferred calls need to run.

How the defer bits work:

Each defer that appears in a function is assigned one bit; the bit is set to 1 when the defer statement is executed and stays 0 otherwise. Just before the function returns, each position is tested against the mask: if the bit is 1, the corresponding deferred function is called, otherwise it is skipped.

To keep this lightweight, the defer bits are limited to a single byte, i.e. 8 bits, which is why a function may not have more than 8 open-coded defers; beyond that the compiler falls back to stack or heap allocation. In practice, most functions have far fewer than 8 defers.
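
A hand-written sketch of what open coding effectively generates for conditional defers (the deferBits variable and the exit checks below are illustrative, not the actual generated code):

```go
package main

import "fmt"

// What we write:
//
//	func f(a, b bool) {
//		if a {
//			defer fmt.Println("A")
//		}
//		if b {
//			defer fmt.Println("B")
//		}
//	}
//
// Roughly what open coding turns it into:
func f(a, b bool) {
	var deferBits uint8 // one bit per defer site, kept in a stack slot

	if a {
		deferBits |= 1 << 0 // defer #0 activated
	}
	if b {
		deferBits |= 1 << 1 // defer #1 activated
	}

	// Function exit: test each bit and run the activated defers in reverse order.
	if deferBits&(1<<1) != 0 {
		fmt.Println("B")
	}
	if deferBits&(1<<0) != 0 {
		fmt.Println("A")
	}
}

func main() { f(true, false) }
```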

```go
// The constant is known to runtime.
const tmpstringbufsize = 32
const zeroValSize = 1024 // must match value of runtime/map.go:maxZero

func walk(fn *Node) {
	Curfn = fn

	if Debug['W'] != 0 {
		s := fmt.Sprintf("\nbefore walk %v", Curfn.Func.Nname.Sym)
		dumplist(s, Curfn.Nbody)
	}

	lno := lineno

	// Final typecheck for any unused variables.
	for i, ln := range fn.Func.Dcl {
		if ln.Op == ONAME && (ln.Class() == PAUTO || ln.Class() == PAUTOHEAP) {
			ln = typecheck(ln, ctxExpr|ctxAssign)
			fn.Func.Dcl[i] = ln
		}
	}

	// Propagate the used flag for typeswitch variables up to the NONAME in its definition.
	for _, ln := range fn.Func.Dcl {
		if ln.Op == ONAME && (ln.Class() == PAUTO || ln.Class() == PAUTOHEAP) && ln.Name.Defn != nil && ln.Name.Defn.Op == OTYPESW && ln.Name.Used() {
			ln.Name.Defn.Left.Name.SetUsed(true)
		}
	}

	for _, ln := range fn.Func.Dcl {
		if ln.Op != ONAME || (ln.Class() != PAUTO && ln.Class() != PAUTOHEAP) || ln.Sym.Name[0] == '&' || ln.Name.Used() {
			continue
		}
		if defn := ln.Name.Defn; defn != nil && defn.Op == OTYPESW {
			if defn.Left.Name.Used() {
				continue
			}
			yyerrorl(defn.Left.Pos, "%v declared but not used", ln.Sym)
			defn.Left.Name.SetUsed(true) // suppress repeats
		} else {
			yyerrorl(ln.Pos, "%v declared but not used", ln.Sym)
		}
	}

	lineno = lno
	if nerrors != 0 {
		return
	}

	walkstmtlist(Curfn.Nbody.Slice())
	if Debug['W'] != 0 {
		s := fmt.Sprintf("after walk %v", Curfn.Func.Nname.Sym)
		dumplist(s, Curfn.Nbody)
	}

	zeroResults()
	heapmoves()
	if Debug['W'] != 0 && Curfn.Func.Enter.Len() > 0 {
		s := fmt.Sprintf("enter %v", Curfn.Func.Nname.Sym)
		dumplist(s, Curfn.Func.Enter)
	}
}
```

In open-coded mode, at most 8 defers are supported by default (maxOpenDefers); if a function exceeds that, open coding is disabled for it.

```go
const maxOpenDefers = 8

func walkstmt(n *Node) *Node {
	...
	switch n.Op {
	case ODEFER:
		Curfn.Func.SetHasDefer(true)
		Curfn.Func.numDefers++
		if Curfn.Func.numDefers > maxOpenDefers {
			// More than 8 defers in the function: a single byte of
			// defer bits cannot record them all, so disallow open coding.
			Curfn.Func.SetOpenCodedDeferDisallowed(true)
		}
		if n.Esc != EscNever {
			// The defer was not marked EscNever by escape analysis,
			// i.e. it occurs in a loop, so open coding is disallowed too.
			Curfn.Func.SetOpenCodedDeferDisallowed(true)
		}
	...
}
```

In short, the conditions for open-coded defers are: at most 8 defers, the product of return statements and defer statements not exceeding 15, no defer inside a loop, and optimizations not disabled via -gcflags "-N".
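
To see the effect in practice, a rough benchmark like the one below (the package and function names are illustrative) can be run with go test -bench=. under Go 1.12, 1.13 and 1.14 to compare the three modes:

```go
package deferbench

import (
	"sync"
	"testing"
)

var mu sync.Mutex

// withDefer unlocks via defer; its cost depends on the defer mode in use.
func withDefer() {
	mu.Lock()
	defer mu.Unlock()
}

// withoutDefer unlocks directly, as a baseline.
func withoutDefer() {
	mu.Lock()
	mu.Unlock()
}

func BenchmarkDefer(b *testing.B) {
	for i := 0; i < b.N; i++ {
		withDefer()
	}
}

func BenchmarkNoDefer(b *testing.B) {
	for i := 0; i < b.N; i++ {
		withoutDefer()
	}
}
```

The gap between the two benchmarks should shrink noticeably from 1.12 (heap-allocated) to 1.13 (stack-allocated) to 1.14 (open-coded).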