| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343 |
- package benchmarks
-
- import (
- "fmt"
- "runtime"
- "runtime/debug"
- "sync"
- "testing"
- "time"
- )
-
// Buffer geometry shared by every test and benchmark in this file. The
// trailing annotations name the tls identifiers these values were copied from.
const (
	// maxRecordSize is the length in bytes of each record buffer, whether it
	// is a goroutine-local array or a slice taken from bufPool.
	maxRecordSize = 16 * 1024 // tls.MaxRecordSize

	// sizeHeader is the offset inside a record buffer at which payload bytes
	// are copied.
	sizeHeader = 5 // tls.SizeHeader
)

// sink receives one byte per iteration from the throughput benchmarks so that
// the copied data is observably used.
var sink byte
-
- // stackGoroutineRealistic simulates doppel start() with realistic buffer USE.
- // The key: merely declaring [16384]byte doesn't grow the stack. Actually
- // writing into it (via copy in the write loop) triggers the lazy stack growth
- // from 2KB -> 32KB.
- func stackGoroutineRealistic(done <-chan struct{}, wg *sync.WaitGroup, payload []byte) {
- // goroutine 1: start() with 16KB stack buffer, actually used
- wg.Add(1)
- go func() {
- defer wg.Done()
- var buf [maxRecordSize]byte
- // Simulate the write path in doppel start():
- // n, _ := c.p.writeStream.Read(buf[tls.SizeHeader : tls.SizeHeader+size])
- // tls.WriteRecordInPlace(c.Conn, buf[:], n)
- copy(buf[sizeHeader:], payload)
- <-done
- runtime.KeepAlive(&buf)
- }()
-
- // goroutine 2: clock tick loop
- wg.Add(1)
- go func() {
- defer wg.Done()
- ticker := time.NewTicker(50 * time.Millisecond)
- defer ticker.Stop()
- for {
- select {
- case <-done:
- return
- case <-ticker.C:
- }
- }
- }()
- }
-
- var bufPool = sync.Pool{
- New: func() any {
- b := make([]byte, maxRecordSize)
- return &b
- },
- }
-
- // poolGoroutineRealistic simulates the same pair with pool-based buffer.
- func poolGoroutineRealistic(done <-chan struct{}, wg *sync.WaitGroup, payload []byte) {
- // goroutine 1: start() with pooled buffer
- wg.Add(1)
- go func() {
- defer wg.Done()
- bp := bufPool.Get().(*[]byte)
- buf := *bp
- copy(buf[sizeHeader:], payload)
- defer bufPool.Put(bp)
- <-done
- runtime.KeepAlive(&buf)
- }()
-
- // goroutine 2: clock tick loop
- wg.Add(1)
- go func() {
- defer wg.Done()
- ticker := time.NewTicker(50 * time.Millisecond)
- defer ticker.Stop()
- for {
- select {
- case <-done:
- return
- case <-ticker.C:
- }
- }
- }()
- }
-
// measureMem runs two back-to-back garbage collections and then returns a
// snapshot of the runtime's memory statistics.
func measureMem() runtime.MemStats {
	var stats runtime.MemStats
	for pass := 0; pass < 2; pass++ {
		runtime.GC()
	}
	runtime.ReadMemStats(&stats)
	return stats
}
-
- // TestDoppelStackGrowthMechanism demonstrates that [16384]byte on the goroutine
- // stack only triggers growth when the buffer is ACTUALLY WRITTEN TO (not just
- // declared). Go's lazy stack growth means the stack guard page must be hit.
- func TestDoppelStackGrowthMechanism(t *testing.T) {
- debug.SetGCPercent(-1)
- defer debug.SetGCPercent(100)
-
- const N = 2000
- payload := make([]byte, 1400) // typical TLS payload
- for i := range payload {
- payload[i] = byte(i)
- }
-
- // Phase 1: goroutines that declare [16384]byte but only touch buf[0]
- {
- runtime.GC()
- time.Sleep(50 * time.Millisecond)
- before := measureMem()
-
- done := make(chan struct{})
- var wg sync.WaitGroup
- for i := 0; i < N; i++ {
- wg.Add(1)
- go func() {
- defer wg.Done()
- var buf [maxRecordSize]byte
- buf[0] = 1
- <-done
- runtime.KeepAlive(&buf)
- }()
- }
- time.Sleep(200 * time.Millisecond)
- after := measureMem()
-
- stackPerG := (after.StackInuse - before.StackInuse) / N
- t.Logf("DECLARE-ONLY: stack/goroutine = %d bytes (stack not grown)", stackPerG)
-
- close(done)
- wg.Wait()
- }
-
- runtime.GC()
- time.Sleep(100 * time.Millisecond)
-
- // Phase 2: goroutines that actually copy() into the buffer (realistic)
- {
- runtime.GC()
- time.Sleep(50 * time.Millisecond)
- before := measureMem()
-
- done := make(chan struct{})
- var wg sync.WaitGroup
- for i := 0; i < N; i++ {
- wg.Add(1)
- go func() {
- defer wg.Done()
- var buf [maxRecordSize]byte
- copy(buf[sizeHeader:], payload)
- <-done
- runtime.KeepAlive(&buf)
- }()
- }
- time.Sleep(200 * time.Millisecond)
- after := measureMem()
-
- stackPerG := (after.StackInuse - before.StackInuse) / N
- t.Logf("COPY-INTO: stack/goroutine = %d bytes (stack grown to 32KB)", stackPerG)
-
- close(done)
- wg.Wait()
- }
-
- runtime.GC()
- time.Sleep(100 * time.Millisecond)
-
- // Phase 3: pool-based with copy (realistic alternative)
- {
- runtime.GC()
- time.Sleep(50 * time.Millisecond)
- before := measureMem()
-
- done := make(chan struct{})
- var wg sync.WaitGroup
- for i := 0; i < N; i++ {
- wg.Add(1)
- go func() {
- defer wg.Done()
- bp := bufPool.Get().(*[]byte)
- buf := *bp
- copy(buf[sizeHeader:], payload)
- defer bufPool.Put(bp)
- <-done
- runtime.KeepAlive(&buf)
- }()
- }
- time.Sleep(200 * time.Millisecond)
- after := measureMem()
-
- stackPerG := (after.StackInuse - before.StackInuse) / N
- heapPerG := (after.HeapInuse - before.HeapInuse) / N
- t.Logf("POOL-BASED: stack/goroutine = %d bytes, heap/goroutine = %d bytes",
- stackPerG, heapPerG)
-
- close(done)
- wg.Wait()
- }
- }
-
- // TestDoppelCombinedOverhead measures the memory of the full doppel Conn pair
- // (start goroutine + clock goroutine) at various concurrency levels.
- // Uses realistic buffer usage pattern that triggers stack growth.
- func TestDoppelCombinedOverhead(t *testing.T) {
- payload := make([]byte, 1400)
- for i := range payload {
- payload[i] = byte(i)
- }
-
- for _, n := range []int{500, 1000, 2000} {
- t.Run(fmt.Sprintf("N=%d", n), func(t *testing.T) {
- debug.SetGCPercent(-1)
- defer debug.SetGCPercent(100)
-
- // Stack-allocated approach (current code pattern)
- var stackTotal uint64
- {
- runtime.GC()
- time.Sleep(50 * time.Millisecond)
- before := measureMem()
-
- done := make(chan struct{})
- var wg sync.WaitGroup
- for i := 0; i < n; i++ {
- stackGoroutineRealistic(done, &wg, payload)
- }
- time.Sleep(200 * time.Millisecond)
- after := measureMem()
-
- stackMem := after.StackInuse - before.StackInuse
- heapMem := after.HeapInuse - before.HeapInuse
- stackTotal = stackMem + heapMem
-
- t.Logf("STACK: %d conns (2 goroutines each = %d goroutines)", n, n*2)
- t.Logf(" StackInuse: %d KB (%d bytes/conn)", stackMem/1024, stackMem/uint64(n))
- t.Logf(" HeapInuse: %d KB (%d bytes/conn)", heapMem/1024, heapMem/uint64(n))
- t.Logf(" Total: %d KB (%.1f MB)", (stackMem+heapMem)/1024,
- float64(stackMem+heapMem)/(1024*1024))
-
- close(done)
- wg.Wait()
- }
-
- runtime.GC()
- time.Sleep(100 * time.Millisecond)
-
- // Pool-based approach
- {
- runtime.GC()
- time.Sleep(50 * time.Millisecond)
- before := measureMem()
-
- done := make(chan struct{})
- var wg sync.WaitGroup
- for i := 0; i < n; i++ {
- poolGoroutineRealistic(done, &wg, payload)
- }
- time.Sleep(200 * time.Millisecond)
- after := measureMem()
-
- stackMem := after.StackInuse - before.StackInuse
- heapMem := after.HeapInuse - before.HeapInuse
- poolTotal := stackMem + heapMem
-
- t.Logf("POOL: %d conns (2 goroutines each = %d goroutines)", n, n*2)
- t.Logf(" StackInuse: %d KB (%d bytes/conn)", stackMem/1024, stackMem/uint64(n))
- t.Logf(" HeapInuse: %d KB (%d bytes/conn)", heapMem/1024, heapMem/uint64(n))
- t.Logf(" Total: %d KB (%.1f MB)", (stackMem+heapMem)/1024,
- float64(stackMem+heapMem)/(1024*1024))
-
- savings := int64(stackTotal) - int64(poolTotal)
- t.Logf("SAVINGS: %d KB total (%d bytes/conn), %.0f%% reduction",
- savings/1024, savings/int64(n),
- float64(savings)/float64(stackTotal)*100)
-
- close(done)
- wg.Wait()
- }
- })
- }
- }
-
- // BenchmarkDoppelBufStack benchmarks goroutine pair lifecycle with stack buffer.
- func BenchmarkDoppelBufStack(b *testing.B) {
- payload := make([]byte, 1400)
- for b.Loop() {
- done := make(chan struct{})
- var wg sync.WaitGroup
- stackGoroutineRealistic(done, &wg, payload)
- close(done)
- wg.Wait()
- }
- }
-
- // BenchmarkDoppelBufPool benchmarks goroutine pair lifecycle with pool buffer.
- func BenchmarkDoppelBufPool(b *testing.B) {
- payload := make([]byte, 1400)
- for b.Loop() {
- done := make(chan struct{})
- var wg sync.WaitGroup
- poolGoroutineRealistic(done, &wg, payload)
- close(done)
- wg.Wait()
- }
- }
-
- // BenchmarkDoppelThroughputStack simulates write throughput with stack buffer.
- func BenchmarkDoppelThroughputStack(b *testing.B) {
- payload := make([]byte, 1400)
- for i := range payload {
- payload[i] = byte(i)
- }
- b.SetBytes(int64(len(payload)))
-
- for b.Loop() {
- var buf [maxRecordSize]byte
- copy(buf[sizeHeader:], payload)
- sink = buf[sizeHeader]
- }
- }
-
- // BenchmarkDoppelThroughputPool simulates write throughput with pooled buffer.
- func BenchmarkDoppelThroughputPool(b *testing.B) {
- payload := make([]byte, 1400)
- for i := range payload {
- payload[i] = byte(i)
- }
- b.SetBytes(int64(len(payload)))
-
- for b.Loop() {
- bp := bufPool.Get().(*[]byte)
- buf := *bp
- copy(buf[sizeHeader:], payload)
- sink = buf[sizeHeader]
- bufPool.Put(bp)
- }
- }
|