package relay

import (
	"fmt"
	"io"
	"net"
	"runtime"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"github.com/9seconds/mtg/v2/mtglib/internal/tls"
)

// ============================================================
// Stress test: N concurrent connections, each transferring dataPerConn bytes.
// Measures total wall-clock time, aggregate throughput, GC pauses, and the
// stack/heap memory in use once all transfers have finished.
// This is the closest simulation to real proxy load.
// ============================================================

// stressResult holds the measurements collected by a single runStressTest call.
//
// The "peak" fields keep their historical names but are not true high-water
// marks: they are one runtime.MemStats sample taken after every connection
// has finished.
type stressResult struct {
	totalBytes    int64         // bytes received by all readers combined
	wallTime      time.Duration // time from the first launch until the last reader finished
	gcPauseTotal  time.Duration // growth of MemStats.PauseTotalNs during the run
	numGC         uint32        // growth of MemStats.NumGC during the run
	peakStackMB   float64       // MemStats.StackInuse in MiB, sampled after the run
	peakHeapMB    float64       // MemStats.HeapInuse in MiB, sampled after the run
	peakTotalMB   float64       // StackInuse + HeapInuse in MiB, sampled after the run
	throughputMBs float64       // totalBytes / wallTime in MiB per second
}

// runStressTest starts numConns concurrent in-memory connections (net.Pipe).
// For each one a writer goroutine sends dataPerConn zero bytes in chunks of at
// most 32 KiB while the reader drains the other end through io.CopyBuffer,
// using a buffer obtained from getBuf and handed back through putBuf.
//
// It returns the number of bytes received, the wall-clock time, throughput,
// GC activity during the run and a memory sample taken after the run. A copy
// that ends with an error is reported on b and its partial byte count is
// still included in the totals.
func runStressTest(b *testing.B, numConns int, dataPerConn int, getBuf func() []byte, putBuf func([]byte)) stressResult {
	b.Helper()

	// Settle the heap before taking the baseline so that the GC deltas below
	// exclude these forced collections.
	runtime.GC()
	runtime.GC()

	var memBefore runtime.MemStats
	runtime.ReadMemStats(&memBefore)

	// io.CopyBuffer ignores its buffer argument whenever dst implements
	// io.ReaderFrom, and io.Discard does. Exposing only the Write method
	// forces the copy to go through the buffer supplied by getBuf, which is
	// the whole point of comparing buffer strategies.
	sink := struct{ io.Writer }{io.Discard}

	var (
		totalTransferred atomic.Int64
		wg               sync.WaitGroup
	)

	start := time.Now()

	for i := 0; i < numConns; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()

			serverConn, clientConn := net.Pipe()
			defer clientConn.Close()

			// Writer side: push dataPerConn bytes, then close so that the
			// reader observes EOF and terminates.
			go func() {
				defer serverConn.Close()

				data := make([]byte, 32*1024)
				for written := 0; written < dataPerConn; {
					chunk := data
					if remaining := dataPerConn - written; remaining < len(chunk) {
						chunk = chunk[:remaining]
					}

					n, err := serverConn.Write(chunk)
					written += n
					if err != nil {
						return
					}
				}
			}()

			// Reader side: this is the simulated relay pump.
			buf := getBuf()
			n, err := io.CopyBuffer(sink, clientConn, buf)
			putBuf(buf)
			totalTransferred.Add(n)

			if err != nil {
				// Errorf (unlike Fatalf) may be called from any goroutine.
				b.Errorf("relay copy ended after %d bytes: %v", n, err)
			}
		}()
	}

	wg.Wait()
	elapsed := time.Since(start)

	var memAfter runtime.MemStats
	runtime.ReadMemStats(&memAfter)

	const mib = 1024 * 1024

	total := totalTransferred.Load()

	return stressResult{
		totalBytes:    total,
		wallTime:      elapsed,
		gcPauseTotal:  time.Duration(memAfter.PauseTotalNs - memBefore.PauseTotalNs),
		numGC:         memAfter.NumGC - memBefore.NumGC,
		peakStackMB:   float64(memAfter.StackInuse) / mib,
		peakHeapMB:    float64(memAfter.HeapInuse) / mib,
		peakTotalMB:   float64(memAfter.StackInuse+memAfter.HeapInuse) / mib,
		throughputMBs: float64(total) / elapsed.Seconds() / mib,
	}
}

// reportStress publishes the measurements in r as custom benchmark metrics on
// b: throughput, the stack/heap/total memory samples, the total GC pause in
// microseconds and the number of GC cycles.
func reportStress(b *testing.B, r stressResult) {
	metrics := [...]struct {
		unit  string
		value float64
	}{
		{"MB/s", r.throughputMBs},
		{"peak_stack_MB", r.peakStackMB},
		{"peak_heap_MB", r.peakHeapMB},
		{"peak_total_MB", r.peakTotalMB},
		{"gc_pause_us", float64(r.gcPauseTotal.Microseconds())},
		{"gc_cycles", float64(r.numGC)},
	}

	for _, m := range metrics {
		b.ReportMetric(m.value, m.unit)
	}
}

// BenchmarkStress_ConcurrentRelays runs runStressTest for every combination of
// load scenario and buffer strategy, as one sub-benchmark per pair named
// "<scenario>/<strategy>", and publishes the result through reportStress.
func BenchmarkStress_ConcurrentRelays(b *testing.B) {
	type bufStrategy struct {
		name   string
		getBuf func() []byte
		putBuf func([]byte)
	}

	type scenario struct {
		conns       int
		dataPerConn int
		label       string
	}

	const mib = 1024 * 1024

	pool16 := &sync.Pool{New: func() any {
		buf := make([]byte, tls.MaxRecordPayloadSize)
		return &buf
	}}
	pool4 := &sync.Pool{New: func() any {
		buf := make([]byte, 4096)
		return &buf
	}}

	// pooled builds a strategy that borrows its buffer from p and returns it
	// afterwards. The pool stores *[]byte, so Put wraps the slice in a fresh
	// pointer.
	pooled := func(name string, p *sync.Pool) bufStrategy {
		return bufStrategy{
			name:   name,
			getBuf: func() []byte { return *p.Get().(*[]byte) },
			putBuf: func(buf []byte) { p.Put(&buf) },
		}
	}

	strategies := []bufStrategy{
		{
			// A fresh buffer per connection that is never reused.
			// NOTE(review): the slice is returned from a closure, so it
			// presumably escapes to the heap despite the "stack" label;
			// confirm with -gcflags=-m.
			name:   "stack_16KB",
			getBuf: func() []byte { return make([]byte, tls.MaxRecordPayloadSize) },
			putBuf: func([]byte) {},
		},
		pooled("pool_16KB", pool16),
		pooled("pool_4KB", pool4),
	}

	scenarios := []scenario{
		// 100 connections x 10 MB = 1 GB total.
		{conns: 100, dataPerConn: 10 * mib, label: "100conn_10MB"},
		// 500 connections x 10 MB = 5 GB total.
		{conns: 500, dataPerConn: 10 * mib, label: "500conn_10MB"},
		// 1000 connections x 10 MB = 10 GB total.
		{conns: 1000, dataPerConn: 10 * mib, label: "1000conn_10MB"},
		// 2000 connections x 1 MB = 2 GB total (many short connections).
		{conns: 2000, dataPerConn: 1 * mib, label: "2000conn_1MB"},
		// 500 connections x 50 MB = 25 GB total (big files).
		{conns: 500, dataPerConn: 50 * mib, label: "500conn_50MB"},
	}

	for _, sc := range scenarios {
		for _, strat := range strategies {
			// Per-iteration copies so the closure below does not depend on
			// the Go version's loop-variable semantics.
			sc, strat := sc, strat

			b.Run(fmt.Sprintf("%s/%s", sc.label, strat.name), func(b *testing.B) {
				for i := 0; i < b.N; i++ {
					reportStress(b, runStressTest(b, sc.conns, sc.dataPerConn, strat.getBuf, strat.putBuf))
				}
			})
		}
	}
}

// BenchmarkStress_PoolContention measures sync.Pool Get/Put under heavy
// concurrent access. Each sub-benchmark "workers=N" runs at least N goroutines
// that repeatedly take a buffer from one shared pool, touch its first and last
// byte, and put it back.
func BenchmarkStress_PoolContention(b *testing.B) {
	pool := &sync.Pool{New: func() any { buf := make([]byte, tls.MaxRecordPayloadSize); return &buf }}

	for _, numWorkers := range []int{100, 500, 1000, 2000} {
		numWorkers := numWorkers

		b.Run(fmt.Sprintf("workers=%d", numWorkers), func(b *testing.B) {
			// RunParallel starts parallelism*GOMAXPROCS goroutines and the
			// parallelism defaults to 1. Without this call every
			// sub-benchmark would run with the same GOMAXPROCS goroutines
			// regardless of numWorkers. Rounding up gives at least
			// numWorkers goroutines.
			procs := runtime.GOMAXPROCS(0)
			b.SetParallelism((numWorkers + procs - 1) / procs)

			b.RunParallel(func(pb *testing.PB) {
				for pb.Next() {
					bp := pool.Get().(*[]byte)
					// Simulate minimal work with the buffer.
					(*bp)[0] = 1
					(*bp)[len(*bp)-1] = 1
					pool.Put(bp)
				}
			})
		})
	}
}

// BenchmarkStress_TinyPackets simulates massive amounts of tiny packets
// (chat messages, typing indicators, status updates, ACKs).
//
// For every scenario/strategy pair it opens sc.conns in-memory connections
// (net.Pipe). On each one a writer sends sc.pktsPerConn packets of sc.pktSize
// bytes while the reader calls Read directly with the strategy's buffer until
// the writer closes. Many small writes maximize the per-read overhead.
//
// Reported metrics: throughput, successful reads per second, total number of
// reads, GC cycles and pause time during the run, and stack/heap memory in
// use sampled once after the run (the "peak_*" names are historical).
func BenchmarkStress_TinyPackets(b *testing.B) {
	type bufStrategy struct {
		name   string
		getBuf func() []byte
		putBuf func([]byte)
	}

	pool16 := &sync.Pool{New: func() any { buf := make([]byte, tls.MaxRecordPayloadSize); return &buf }}
	pool4 := &sync.Pool{New: func() any { buf := make([]byte, 4096); return &buf }}

	strategies := []bufStrategy{
		{
			name:   "stack_16KB",
			getBuf: func() []byte { return make([]byte, tls.MaxRecordPayloadSize) },
			putBuf: func([]byte) {},
		},
		{
			name:   "pool_16KB",
			getBuf: func() []byte { return *pool16.Get().(*[]byte) },
			putBuf: func(b []byte) { pool16.Put(&b) },
		},
		{
			name:   "pool_4KB",
			getBuf: func() []byte { return *pool4.Get().(*[]byte) },
			putBuf: func(b []byte) { pool4.Put(&b) },
		},
	}

	type scenario struct {
		conns       int
		pktSize     int
		pktsPerConn int
		label       string
	}

	scenarios := []scenario{
		// Chat-like: 100 connections, 50K tiny packets each (50 bytes = typing indicator / small ACK)
		{100, 50, 50000, "100conn_50B_x50K"},
		// Heavy chat: 500 connections, 10K packets of 200 bytes
		{500, 200, 10000, "500conn_200B_x10K"},
		// Extreme: 1000 connections, 20K packets of 100 bytes each
		{1000, 100, 20000, "1000conn_100B_x20K"},
		// Burst of tiny: 2000 connections, 5K packets of 50 bytes
		{2000, 50, 5000, "2000conn_50B_x5K"},
	}

	for _, sc := range scenarios {
		for _, strat := range strategies {
			name := fmt.Sprintf("%s/%s", sc.label, strat.name)
			getBuf := strat.getBuf
			putBuf := strat.putBuf
			sc := sc

			b.Run(name, func(b *testing.B) {
				totalBytes := int64(sc.conns) * int64(sc.pktSize) * int64(sc.pktsPerConn)
				b.SetBytes(totalBytes)

				for i := 0; i < b.N; i++ {
					runtime.GC()

					var memBefore runtime.MemStats
					runtime.ReadMemStats(&memBefore)

					var (
						totalRead  atomic.Int64
						totalReads atomic.Int64
						wg         sync.WaitGroup
					)

					start := time.Now()

					for c := 0; c < sc.conns; c++ {
						wg.Add(1)
						go func() {
							defer wg.Done()

							serverConn, clientConn := net.Pipe()
							defer clientConn.Close()

							// Writer side: closing the pipe when done is what
							// ends the reader loop below.
							go func() {
								defer serverConn.Close()

								pkt := make([]byte, sc.pktSize)
								for p := 0; p < sc.pktsPerConn; p++ {
									// No deadlines are set, so a write only
									// fails once the pipe is closed; retrying
									// the remaining packets would be pointless.
									if _, err := serverConn.Write(pkt); err != nil {
										return
									}
								}
							}()

							buf := getBuf()

							var reads int64
							for {
								n, err := clientConn.Read(buf)
								if n > 0 {
									totalRead.Add(int64(n))
									reads++
								}
								if err != nil {
									break
								}
							}

							putBuf(buf)
							totalReads.Add(reads)
						}()
					}

					wg.Wait()
					elapsed := time.Since(start)

					var memAfter runtime.MemStats
					runtime.ReadMemStats(&memAfter)

					secs := elapsed.Seconds()
					reads := float64(totalReads.Load())

					b.ReportMetric(float64(totalRead.Load())/secs/(1024*1024), "MB/s")
					b.ReportMetric(reads/secs, "packets/s")
					b.ReportMetric(reads, "total_reads")
					b.ReportMetric(float64(memAfter.StackInuse)/(1024*1024), "peak_stack_MB")
					b.ReportMetric(float64(memAfter.HeapInuse)/(1024*1024), "peak_heap_MB")
					b.ReportMetric(float64(memAfter.NumGC-memBefore.NumGC), "gc_cycles")
					b.ReportMetric(float64(memAfter.PauseTotalNs-memBefore.PauseTotalNs)/1000, "gc_pause_us")
				}
			})
		}
	}
}

// BenchmarkStress_GCPressure measures how the garbage collector behaves under
// relay load (500 connections x 10 MB) for two buffer strategies: a fresh
// 16 KB buffer per connection ("stack_16KB") and a buffer borrowed from a
// sync.Pool ("pool_16KB").
//
// NOTE(review): the "stack_16KB" buffer is returned from a closure, so it
// presumably lives on the heap despite the label; confirm with -gcflags=-m.
//
// gc_cycles and gc_pause_us are the deltas measured by runStressTest, whose
// baseline is taken after its own forced collections, so they count only the
// GC work triggered by the load itself. final_stack_MB and final_heap_MB are
// sampled right after the run returns.
func BenchmarkStress_GCPressure(b *testing.B) {
	const (
		numConns    = 500
		dataPerConn = 10 * 1024 * 1024
		mib         = 1024 * 1024
	)

	pool16 := &sync.Pool{New: func() any { buf := make([]byte, tls.MaxRecordPayloadSize); return &buf }}

	// run registers one sub-benchmark that drives runStressTest with the
	// given buffer strategy and reports the GC-related metrics.
	run := func(name string, getBuf func() []byte, putBuf func([]byte)) {
		b.Run(name, func(b *testing.B) {
			for i := 0; i < b.N; i++ {
				r := runStressTest(b, numConns, dataPerConn, getBuf, putBuf)

				var memAfter runtime.MemStats
				runtime.ReadMemStats(&memAfter)

				b.ReportMetric(r.throughputMBs, "MB/s")
				b.ReportMetric(float64(r.numGC), "gc_cycles")
				b.ReportMetric(float64(r.gcPauseTotal.Nanoseconds())/1000, "gc_pause_us")
				b.ReportMetric(float64(memAfter.StackInuse)/mib, "final_stack_MB")
				b.ReportMetric(float64(memAfter.HeapInuse)/mib, "final_heap_MB")
			}
		})
	}

	run("stack_16KB",
		func() []byte { return make([]byte, tls.MaxRecordPayloadSize) },
		func([]byte) {},
	)

	run("pool_16KB",
		func() []byte { return *pool16.Get().(*[]byte) },
		func(buf []byte) { pool16.Put(&buf) },
	)
}