#!/usr/bin/awk -f
#
# stats.awk -- stream statistics utility with 1-pass linear regression
# Rev.1.2 (2026-02-08)
# (c) 2026, Takayuki HOSODA
# SPDX-License-Identifier: BSD-3-Clause

BEGIN {
    # Option handling.  All options arrive as awk variables (e.g. via -v):
    #   xcol, ycol  field numbers used for x and y (optional)
    #   prec        digit count placed in the printf conversion (default 8)
    #   fmt         gen | sci | real  -> %g | %e | %f (default gen)
    #   out         text | csv | tsv | json (default text)
    # Any invalid option prints a message on stderr and exits with status 1.

    # ---------- column selection ----------
    # xcol/ycol are later used as $(xcol) / $(ycol).  Without this check a
    # non-numeric value would silently select $0 and a negative value would
    # abort awk with a field-access error in the middle of the stream.
    if (xcol != "") {
        if (xcol !~ /^[0-9]+$/ || xcol < 1) {
            print "xcol must be a positive integer" > "/dev/stderr"
            exit 1
        }
        xcol += 0    # force a number so comparisons against NF are numeric
    }
    if (ycol != "") {
        if (ycol !~ /^[0-9]+$/ || ycol < 1) {
            print "ycol must be a positive integer" > "/dev/stderr"
            exit 1
        }
        ycol += 0
    }

    # ---------- numeric format ----------
    if (prec == "") prec = 8
    if (prec !~ /^[0-9]+$/ || prec < 1 || prec > 16) {
        print "prec must be integer in [1,16]" > "/dev/stderr"
        exit 1
    }

    # numfmt is the printf conversion shared by every numeric output line.
    fmt = tolower(fmt)
    if (fmt == "" || fmt == "gen")   numfmt = "%." prec "g"
    else if (fmt == "sci")           numfmt = "%." prec "e"
    else if (fmt == "real")          numfmt = "%." prec "f"
    else {
        print "fmt must be real|sci|gen" > "/dev/stderr"
        exit 1
    }

    # ---------- output format ----------
    out = tolower(out)
    if (out == "") out = "text"
    if (out != "text" && out != "csv" && out != "json" && out != "tsv") {
        print "out must be text|csv|tsv|json" > "/dev/stderr"
        exit 1
    }

    # auto1d is switched on by the main rule when the input turns out to be
    # a single column and neither xcol nor ycol was given.
    auto1d = 0
}

# ---------- helpers ----------

# pf(label, v): print one line of the text report -- the label left-justified
# in an 18-character field, a space, then v rendered by fmtv() (i.e. with the
# global numfmt conversion).
function pf(label, v)
{
    printf "%-18s %s\n", label, fmtv(v)
}

# fmtv(v): return v as a string, converted with the global printf
# conversion held in numfmt.  Nothing is printed.
function fmtv(v)
{
    return sprintf(numfmt, v)
}

# ---------- main ----------

# Per-record rule: fold one (x, y) sample into the running statistics.
# State kept across records:
#   count            samples accepted so far
#   mean, M2         running mean of y and sum of squared deviations
#   p1, p2 / n1, n2  largest, 2nd largest / smallest, 2nd smallest y
#   mean_x, mean_y   running means used by the regression
#   S_xx, C_xy       running sum of squared x deviations and x-y co-moment
{
    # A blank record holds no sample.  Skipping it here keeps it from being
    # counted as y = 0 in single-column mode and from influencing the
    # column-mode decision below.
    if (NF == 0) next

    # Decide the column mode while no sample has been accepted yet.
    if (count == 0) {
        if (NF == 1 && xcol == "" && ycol == "") {
            # Single column, no explicit columns: y is $1, x is the
            # 1-based sample index.
            auto1d = 1
        } else {
            if (xcol == "") xcol = 1
            if (ycol == "") ycol = xcol
        }
    }

    # ----- extract values -----
    if (auto1d) {
        y = $1 + 0
        x = count + 1
    } else {
        # Records too short to contain both columns are ignored.
        if (ycol > NF || xcol > NF) next
        y = $(ycol) + 0
        x = $(xcol) + 0
    }

    # ----- first sample -----
    if (++count == 1) {
        mean = y
        M2 = 0
        # p2/n2 are placeholders until a second sample arrives.
        p1 = p2 = y
        n1 = n2 = y

        mean_x = x
        mean_y = y
        S_xx = 0
        C_xy = 0
        next
    }

    # ----- statistics on y -----
    # Incremental update: d uses the old mean, (y - mean) the new one.
    d = y - mean
    mean += d / count
    M2 += d * (y - mean)

    if (count == 2) {
        # With exactly two samples the second-ranked value is simply the
        # other sample.  Comparing against the placeholder p2/n2 (equal to
        # the first sample) would leave it stuck at the first sample
        # whenever the second one does not exceed / undercut it.
        if (y > p1) { p2 = p1; p1 = y }
        else p2 = y

        if (y < n1) { n2 = n1; n1 = y }
        else n2 = y
    } else {
        if (y > p1) { p2 = p1; p1 = y }
        else if (y > p2) p2 = y

        if (y < n1) { n2 = n1; n1 = y }
        else if (y < n2) n2 = y
    }

    # ----- regression (x,y) -----
    # dx is taken against the old mean_x; the second factor of each product
    # uses the already-updated mean.
    dx = x - mean_x
    mean_x += dx / count

    dy = y - mean_y
    mean_y += dy / count

    S_xx += dx * (x - mean_x)
    C_xy += dx * (y - mean_y)
}

# ---------- END ----------

# Final rule: derive the summary values from the running state and print
# them in the format selected by `out` (text, csv, tsv or json).
END {
    # An `exit` executed in BEGIN still runs END rules.  BEGIN exits on bad
    # options before numfmt is set or while `out` is still invalid, so either
    # condition means option parsing failed: stop here without printing a
    # report.  A bare `exit` keeps the status given by the earlier `exit 1`.
    if (numfmt == "" ||
        (out != "text" && out != "csv" && out != "tsv" && out != "json"))
        exit

    if (count == 0) {
        print "No numeric data"
        exit
    }

    # Clamp a tiny negative rounding residue so sqrt() below stays defined.
    if (M2 < 0 && M2 > -1e-12) M2 = 0

    # Sample (n-1) variance; defined as 0 for a single sample.
    var = (count > 1) ? M2 / (count - 1) : 0
    std = sqrt(var)
    sum = mean * count

    # Least-squares line y = a + b*x.  When there is one sample or x never
    # varies (S_xx == 0) the slope is reported as 0 and a is the mean of y.
    if (count > 1 && S_xx != 0) {
        b = C_xy / S_xx
        a = mean_y - b * mean_x
    } else {
        b = 0
        a = mean_y
    }

    # With a single sample the second-ranked values fall back to p1 / n1.
    if (out == "text") {
        printf "%-18s %d\n", "Count:", count
        pf("Mean:", mean)
        pf("Sum:", sum)
        pf("Unbiased Variance:", var)
        pf("Unbiased StdDev:", std)
        pf("Peak (Max):", p1)
        pf("2nd Peak:", (count>=2?p2:p1))
        pf("Neg Peak (Min):", n1)
        pf("2nd Neg Peak:", (count>=2?n2:n1))
        pf("Regression a:", a)
        pf("Regression b:", b)
    }
    else if (out == "csv" || out == "tsv") {
        # Two-column metric/value table; only the delimiter differs.
        delim = (out == "csv") ? "," : "\t"
        printf "metric%cvalue\n", delim
        printf "count%c%d\n", delim, count
        printf "mean%c" numfmt "\n", delim, mean
        printf "sum%c" numfmt "\n", delim, sum
        printf "var%c" numfmt "\n", delim, var
        printf "std%c" numfmt "\n", delim, std
        printf "max%c" numfmt "\n", delim, p1
        printf "max2%c" numfmt "\n", delim, (count >= 2 ? p2 : p1)
        printf "min%c" numfmt "\n", delim, n1
        printf "min2%c" numfmt "\n", delim, (count >= 2 ? n2 : n1)
        printf "reg_a%c" numfmt "\n", delim, a
        printf "reg_b%c" numfmt "\n", delim, b
    }
    else if (out == "json") {
        # Numbers are emitted unquoted, pre-formatted by fmtv().
        print "{"
        printf "  \"count\": %d,\n", count
        printf "  \"mean\": %s,\n", fmtv(mean)
        printf "  \"sum\": %s,\n", fmtv(sum)
        printf "  \"var\": %s,\n", fmtv(var)
        printf "  \"std\": %s,\n", fmtv(std)
        printf "  \"max\": %s,\n", fmtv(p1)
        printf "  \"max2\": %s,\n", fmtv((count >= 2 ? p2 : p1))
        printf "  \"min\": %s,\n", fmtv(n1)
        printf "  \"min2\": %s,\n", fmtv((count >= 2 ? n2 : n1))
        printf "  \"regression\": {\n"
        printf "    \"a\": %s,\n", fmtv(a)
        printf "    \"b\": %s\n", fmtv(b)
        printf "  }\n"
        print "}"
    }
}

