package benchmarks

import (
	"fmt"
	"runtime"
	"runtime/debug"
	"sync"
	"testing"
	"time"
)

const (
	maxRecordSize = 16384 // tls.MaxRecordSize
	sizeHeader    = 5     // tls.SizeHeader
)

// sink keeps the throughput benchmarks from being optimized away.
var sink byte

// stackGoroutineRealistic simulates a doppel start() goroutine with realistic
// buffer use. The key point: merely declaring [16384]byte doesn't grow the
// stack. Actually writing into it (via copy in the write loop) triggers the
// lazy stack growth from 2KB -> 32KB.
func stackGoroutineRealistic(done <-chan struct{}, wg *sync.WaitGroup, payload []byte) {
	// goroutine 1: start() with a 16KB stack buffer that is actually used
	wg.Add(1)
	go func() {
		defer wg.Done()
		var buf [maxRecordSize]byte
		// Simulate the write path in doppel start():
		//   n, _ := c.p.writeStream.Read(buf[tls.SizeHeader : tls.SizeHeader+size])
		//   tls.WriteRecordInPlace(c.Conn, buf[:], n)
		copy(buf[sizeHeader:], payload)
		<-done
		runtime.KeepAlive(&buf)
	}()

	// goroutine 2: clock tick loop
	wg.Add(1)
	go func() {
		defer wg.Done()
		ticker := time.NewTicker(50 * time.Millisecond)
		defer ticker.Stop()
		for {
			select {
			case <-done:
				return
			case <-ticker.C:
			}
		}
	}()
}

// bufPool is the pooled alternative to the per-goroutine stack buffer.
var bufPool = sync.Pool{
	New: func() any {
		b := make([]byte, maxRecordSize)
		return &b
	},
}

// poolGoroutineRealistic simulates the same goroutine pair with a pool-backed buffer.
func poolGoroutineRealistic(done <-chan struct{}, wg *sync.WaitGroup, payload []byte) {
	// goroutine 1: start() with a pooled buffer
	wg.Add(1)
	go func() {
		defer wg.Done()
		bp := bufPool.Get().(*[]byte)
		buf := *bp
		copy(buf[sizeHeader:], payload)
		defer bufPool.Put(bp)
		<-done
		runtime.KeepAlive(&buf)
	}()

	// goroutine 2: clock tick loop
	wg.Add(1)
	go func() {
		defer wg.Done()
		ticker := time.NewTicker(50 * time.Millisecond)
		defer ticker.Stop()
		for {
			select {
			case <-done:
				return
			case <-ticker.C:
			}
		}
	}()
}

// measureMem forces GC and returns the resulting MemStats.
func measureMem() runtime.MemStats {
	runtime.GC()
	runtime.GC()
	var m runtime.MemStats
	runtime.ReadMemStats(&m)
	return m
}

// TestDoppelStackGrowthMechanism demonstrates that a [16384]byte on the
// goroutine stack only triggers growth when the buffer is actually written to
// (not just declared). Go grows stacks lazily, so the stack guard must
// actually be hit.
func TestDoppelStackGrowthMechanism(t *testing.T) {
	debug.SetGCPercent(-1)
	defer debug.SetGCPercent(100)

	const N = 2000
	payload := make([]byte, 1400) // typical TLS payload
	for i := range payload {
		payload[i] = byte(i)
	}

	// Phase 1: goroutines that declare [16384]byte but only touch buf[0]
	{
		runtime.GC()
		time.Sleep(50 * time.Millisecond)
		before := measureMem()

		done := make(chan struct{})
		var wg sync.WaitGroup
		for i := 0; i < N; i++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				var buf [maxRecordSize]byte
				buf[0] = 1
				<-done
				runtime.KeepAlive(&buf)
			}()
		}
		time.Sleep(200 * time.Millisecond)
		after := measureMem()

		stackPerG := (after.StackInuse - before.StackInuse) / N
		t.Logf("DECLARE-ONLY: stack/goroutine = %d bytes (stack not grown)", stackPerG)

		close(done)
		wg.Wait()
	}

	runtime.GC()
	time.Sleep(100 * time.Millisecond)

	// Phase 2: goroutines that actually copy() into the buffer (realistic)
	{
		runtime.GC()
		time.Sleep(50 * time.Millisecond)
		before := measureMem()

		done := make(chan struct{})
		var wg sync.WaitGroup
		for i := 0; i < N; i++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				var buf [maxRecordSize]byte
				copy(buf[sizeHeader:], payload)
				<-done
				runtime.KeepAlive(&buf)
			}()
		}
		time.Sleep(200 * time.Millisecond)
		after := measureMem()

		stackPerG := (after.StackInuse - before.StackInuse) / N
		t.Logf("COPY-INTO: stack/goroutine = %d bytes (stack grown to 32KB)", stackPerG)

		close(done)
		wg.Wait()
	}

	runtime.GC()
	time.Sleep(100 * time.Millisecond)

	// Phase 3: pool-based with copy (realistic alternative)
	{
		runtime.GC()
		time.Sleep(50 * time.Millisecond)
		before := measureMem()

		done := make(chan struct{})
		var wg sync.WaitGroup
		for i := 0; i < N; i++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				bp := bufPool.Get().(*[]byte)
				buf := *bp
				copy(buf[sizeHeader:], payload)
				defer bufPool.Put(bp)
				<-done
				runtime.KeepAlive(&buf)
			}()
		}
		time.Sleep(200 * time.Millisecond)
		after := measureMem()

		stackPerG := (after.StackInuse - before.StackInuse) / N
		heapPerG := (after.HeapInuse - before.HeapInuse) / N
		t.Logf("POOL-BASED: stack/goroutine = %d bytes, heap/goroutine = %d bytes", stackPerG, heapPerG)

		close(done)
		wg.Wait()
	}
}

// TestDoppelCombinedOverhead measures the memory of the full doppel Conn pair
// (start goroutine + clock goroutine) at various concurrency levels.
// It uses the realistic buffer-usage pattern that triggers stack growth.
func TestDoppelCombinedOverhead(t *testing.T) {
	payload := make([]byte, 1400)
	for i := range payload {
		payload[i] = byte(i)
	}

	for _, n := range []int{500, 1000, 2000} {
		t.Run(fmt.Sprintf("N=%d", n), func(t *testing.T) {
			debug.SetGCPercent(-1)
			defer debug.SetGCPercent(100)

			// Stack-allocated approach (current code pattern)
			var stackTotal uint64
			{
				runtime.GC()
				time.Sleep(50 * time.Millisecond)
				before := measureMem()

				done := make(chan struct{})
				var wg sync.WaitGroup
				for i := 0; i < n; i++ {
					stackGoroutineRealistic(done, &wg, payload)
				}
				time.Sleep(200 * time.Millisecond)
				after := measureMem()

				stackMem := after.StackInuse - before.StackInuse
				heapMem := after.HeapInuse - before.HeapInuse
				stackTotal = stackMem + heapMem

				t.Logf("STACK: %d conns (2 goroutines each = %d goroutines)", n, n*2)
				t.Logf("  StackInuse: %d KB (%d bytes/conn)", stackMem/1024, stackMem/uint64(n))
				t.Logf("  HeapInuse:  %d KB (%d bytes/conn)", heapMem/1024, heapMem/uint64(n))
				t.Logf("  Total:      %d KB (%.1f MB)", (stackMem+heapMem)/1024, float64(stackMem+heapMem)/(1024*1024))

				close(done)
				wg.Wait()
			}

			runtime.GC()
			time.Sleep(100 * time.Millisecond)

			// Pool-based approach
			{
				runtime.GC()
				time.Sleep(50 * time.Millisecond)
				before := measureMem()

				done := make(chan struct{})
				var wg sync.WaitGroup
				for i := 0; i < n; i++ {
					poolGoroutineRealistic(done, &wg, payload)
				}
				time.Sleep(200 * time.Millisecond)
				after := measureMem()

				stackMem := after.StackInuse - before.StackInuse
				heapMem := after.HeapInuse - before.HeapInuse
				poolTotal := stackMem + heapMem

				t.Logf("POOL: %d conns (2 goroutines each = %d goroutines)", n, n*2)
				t.Logf("  StackInuse: %d KB (%d bytes/conn)", stackMem/1024, stackMem/uint64(n))
				t.Logf("  HeapInuse:  %d KB (%d bytes/conn)", heapMem/1024, heapMem/uint64(n))
				t.Logf("  Total:      %d KB (%.1f MB)", (stackMem+heapMem)/1024, float64(stackMem+heapMem)/(1024*1024))

				savings := int64(stackTotal) - int64(poolTotal)
				t.Logf("SAVINGS: %d KB total (%d bytes/conn), %.0f%% reduction",
					savings/1024, savings/int64(n), float64(savings)/float64(stackTotal)*100)

				close(done)
				wg.Wait()
			}
		})
	}
}

// BenchmarkDoppelBufStack benchmarks the goroutine-pair lifecycle with a stack buffer.
func BenchmarkDoppelBufStack(b *testing.B) {
	payload := make([]byte, 1400)
	for b.Loop() {
		done := make(chan struct{})
		var wg sync.WaitGroup
		stackGoroutineRealistic(done, &wg, payload)
		close(done)
		wg.Wait()
	}
}

// BenchmarkDoppelBufPool benchmarks the goroutine-pair lifecycle with a pooled buffer.
func BenchmarkDoppelBufPool(b *testing.B) {
	payload := make([]byte, 1400)
	for b.Loop() {
		done := make(chan struct{})
		var wg sync.WaitGroup
		poolGoroutineRealistic(done, &wg, payload)
		close(done)
		wg.Wait()
	}
}

// BenchmarkDoppelThroughputStack simulates write throughput with a stack buffer.
func BenchmarkDoppelThroughputStack(b *testing.B) {
	payload := make([]byte, 1400)
	for i := range payload {
		payload[i] = byte(i)
	}
	b.SetBytes(int64(len(payload)))
	for b.Loop() {
		var buf [maxRecordSize]byte
		copy(buf[sizeHeader:], payload)
		sink = buf[sizeHeader]
	}
}

// BenchmarkDoppelThroughputPool simulates write throughput with a pooled buffer.
func BenchmarkDoppelThroughputPool(b *testing.B) {
	payload := make([]byte, 1400)
	for i := range payload {
		payload[i] = byte(i)
	}
	b.SetBytes(int64(len(payload)))
	for b.Loop() {
		bp := bufPool.Get().(*[]byte)
		buf := *bp
		copy(buf[sizeHeader:], payload)
		sink = buf[sizeHeader]
		bufPool.Put(bp)
	}
}
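
// Example invocations (a sketch; assumes this file sits in a package named
// "benchmarks" and a Go 1.24+ toolchain, since b.Loop is used):
//
//	go test -v -run 'TestDoppel(StackGrowthMechanism|CombinedOverhead)'
//	go test -run '^$' -bench 'BenchmarkDoppel' -benchmem
//
// For reference, a rough sketch of how the pooled buffer would slot into the
// doppel start() write path modeled above (hypothetical names, mirroring the
// comment in stackGoroutineRealistic):
//
//	bp := bufPool.Get().(*[]byte)
//	defer bufPool.Put(bp)
//	buf := *bp
//	n, _ := c.p.writeStream.Read(buf[tls.SizeHeader : tls.SizeHeader+size])
//	tls.WriteRecordInPlace(c.Conn, buf, n)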