github.com/dreamsxin/gota — Go 1.24+
A comprehensive implementation of DataFrames, Series and data wrangling methods for Go, inspired by pandas. The API is still evolving so use at your own risk.
- Installation
- DataFrame
- Loading data
- Get row data
- Subsetting & Slicing
- Column selection
- Updating values
- Filtering
- GroupBy, Aggregation, Apply & Transform
- Pivot
- Arrange
- Mutate
- Joins
- Function application
- Cumulative statistics
- Diff & PctChange
- FillNA with strategy and limit
- Correlation & Covariance
- Melt (wide → long)
- Excel I/O
- SQL I/O
- Index & MultiIndex
- Chaining operations
- Print to console
- Interfacing with gonum
- Data Exploration
- Missing Data Handling
- Value Operations
- Pipe
- Series
- New DataFrame APIs (v1.3+)
- New I/O APIs (v1.5+)
- License
go get github.com/dreamsxin/gotaRequires Go 1.24+. Key dependencies:
| Package | Purpose |
|---|---|
gonum.org/v1/gonum |
Numeric operations |
github.com/xuri/excelize/v2 |
Excel I/O (no CGO) |
modernc.org/sqlite |
SQL tests (pure Go SQLite) |
github.com/olekukonko/tablewriter |
Table formatting |
A DataFrame is a two-dimensional tabular dataset where columns represent features and rows represent observations. Columns maintain type integrity and support NaN (missing) values.
Construct from Series directly:
df := dataframe.New(
series.New([]string{"b", "a"}, series.String, "COL.1"),
series.New([]int{1, 2}, series.Int, "COL.2"),
series.New([]float64{3.0, 4.0}, series.Float, "COL.3"),
)From [][]string records:
df := dataframe.LoadRecords(
[][]string{
{"A", "B", "C", "D"},
{"a", "4", "5.1", "true"},
{"k", "5", "7.0", "true"},
{"k", "4", "6.0", "true"},
{"a", "2", "7.1", "false"},
},
)From a slice of structs:
type User struct {
Name string
Age int
Accuracy float64
ignored bool // unexported fields are ignored
}
users := []User{
{"Aram", 17, 0.2, true},
{"Juan", 18, 0.8, true},
{"Ana", 22, 0.5, true},
}
df := dataframe.LoadStructs(users)With explicit type configuration:
df := dataframe.LoadRecords(
records,
dataframe.DetectTypes(false),
dataframe.DefaultType(series.Float),
dataframe.WithTypes(map[string]series.Type{
"A": series.String,
"D": series.Bool,
}),
)From []map[string]interface{}:
df := dataframe.LoadMaps(
[]map[string]interface{}{
{"A": "a", "B": 1, "C": true, "D": 0},
{"A": "b", "B": 2, "C": true, "D": 0.5},
},
)From CSV / JSON readers:
df := dataframe.ReadCSV(strings.NewReader(csvStr))
df := dataframe.ReadJSON(strings.NewReader(jsonStr))From HTML tables:
dfs := dataframe.ReadHTML(r) // returns []DataFrame, one per tablerow := df.GetRow(0) // map[string]interface{}sub := df.Subset([]int{0, 2}) // rows by index
sub := df.SliceRow(1, 4) // rows [1, 4) half-open rangesel1 := df.Select([]int{0, 2})
sel2 := df.Select([]string{"A", "C"})
dropped := df.Drop([]string{"B"})df2 := df.Set(
[]int{0, 2},
dataframe.LoadRecords(
[][]string{
{"A", "B", "C", "D"},
{"b", "4", "6.0", "true"},
{"c", "3", "6.0", "false"},
},
),
)// OR filter (default)
fil := df.Filter(
dataframe.F{"A", series.Eq, "a"},
dataframe.F{"B", series.Greater, 4},
)
// Explicit OR
fil := df.FilterAggregation(dataframe.Or,
dataframe.F{"A", series.Eq, "a"},
dataframe.F{"B", series.Greater, 4},
)
// AND filter
fil := df.FilterAggregation(dataframe.And,
dataframe.F{"A", series.Eq, "a"},
dataframe.F{"D", series.Eq, true},
)Built-in comparators: Eq, Neq, Greater, GreaterEq, Less, LessEq, In, Out.
Custom comparator with series.CompFunc:
hasPrefix := func(prefix string) func(series.Element) bool {
return func(el series.Element) bool {
if val, ok := el.Val().(string); ok {
return strings.HasPrefix(val, prefix)
}
return false
}
}
fil := df.Filter(dataframe.F{"A", series.CompFunc, hasPrefix("aa")})groups := df.GroupBy("key1", "key2")
aggre := groups.Aggregation(
[]AggregationType{Aggregation_MAX, Aggregation_MIN},
[]string{"values", "values2"},
)Apply — arbitrary function per group (like pandas groupby().apply()):
result := df.GroupBy("category").Apply(func(g dataframe.DataFrame) dataframe.DataFrame {
return g.Capply(func(s series.Series) series.Series {
return series.Floats(s.Mean())
})
})Transform — per-group column transform aligned to original row order:
groups := df.GroupBy("category")
transformed, err := groups.Transform("value", func(s series.Series) series.Series {
mean := s.Mean()
vals := s.Float()
out := make([]float64, len(vals))
for i, v := range vals {
out[i] = v - mean
}
return series.Floats(out...)
})GetGroups — access the underlying group map:
groupMap := groups.GetGroups() // map[string]DataFramepivot := df.Pivot(
[]string{"A", "B"}, // row keys
[]string{"C", "D"}, // column keys
[]PivotValue{
{Colname: "E", AggregationType: Aggregation_SUM},
{Colname: "F", AggregationType: Aggregation_COUNT},
},
)sorted := df.Arrange(
dataframe.Sort("A"), // ascending
dataframe.RevSort("B"), // descending
)// Replace or add a column
mut := df.Mutate(series.New([]string{"a", "b", "c", "d"}, series.String, "C"))InnerJoin, LeftJoin, RightJoin, OuterJoin, CrossJoin:
join := df.InnerJoin(df2, "D")mean := func(s series.Series) series.Series {
floats := s.Float()
sum := 0.0
for _, f := range floats { sum += f }
return series.Floats(sum / float64(len(floats)))
}
df.Capply(mean) // column-wise
df.Rapply(mean) // row-wisecumDF := df.CumSum() // running sum, all numeric columns
cumProd := df.CumProd("price", "qty") // selected columns onlydiffDF := df.Diff(1) // row[i] - row[i-1]
pct := df.PctChange(2, "close", "volume") // % change over 2 periods// forward-fill, at most 2 consecutive NaNs
filled := df.FillNAStrategyLimit(dataframe.NAFillForward, 2)
// backward-fill with no limit (0 = unlimited), specific columns
filled := df.FillNAStrategyLimit(dataframe.NAFillBackward, 0, "col1", "col2")Also available: df.FillNAStrategy(strategy, subset...) (no limit),
df.DropNA(how, subset...) to drop rows with missing values,
df.DropDuplicates(subset...) to remove duplicate rows.
Returns a square DataFrame whose row/column names match the original numeric columns:
corrMatrix := df.Corr() // Pearson correlation matrix
covMatrix := df.Cov() // sample covariance matrixlong := df.Melt(
[]string{"id", "date"}, // id columns
[]string{"open", "high", "low", "close"}, // value columns (empty = all others)
"field", // variable column name
"value", // value column name
)Uses excelize — no CGO required.
// Read
df := dataframe.ReadXLSX(r)
df := dataframe.ReadXLSXFile("data.xlsx",
dataframe.HasHeader(true),
dataframe.WithTypes(map[string]series.Type{"price": series.Float}),
)
// Write
err := df.WriteXLSX(w)
err := df.WriteXLSXFile("output.xlsx")FromSQL — build a DataFrame from *sql.Rows:
rows, _ := db.Query("SELECT id, name, score FROM users WHERE active = 1")
df := dataframe.FromSQL(rows)WriteSQL — insert into a database table:
err := df.WriteSQL(db, "users",
dataframe.WithCreateTable(true), // CREATE TABLE IF NOT EXISTS
dataframe.WithTruncateFirst(true), // DELETE FROM before inserting
dataframe.WithBatchSize(200), // rows per INSERT (default 500)
)SQL ↔ Series type mapping:
| SQL type | Series type |
|---|---|
| INT / INTEGER / BIGINT … | series.Int |
| REAL / FLOAT / DOUBLE … | series.Float |
| BOOL / BOOLEAN | series.Bool |
| DATE / DATETIME / TIMESTAMP | series.Time |
| everything else | series.String |
Single-level Index
idx := dataframe.NewIndex([]string{"a", "b", "c", "d"})
idf, err := df.WithIndex(idx)
rows := idf.Loc("b") // all rows with label "b"
rows := idf.LocSlice("a", "c") // inclusive label slice
// Use a column as the index (drops that column)
idf, err := df.WithColumnIndex("id")
// Restore to plain DataFrame
plain := idf.ResetIndex("id")Multi-level Index
mi, err := dataframe.NewMultiIndex(
[]string{"2024", "2024", "2025", "2025"}, // level 0
[]string{"Q1", "Q2", "Q1", "Q2"}, // level 1
)
midf, err := df.WithMultiIndex(mi)
rows := midf.Loc("2024", "Q1") // full key
rows := midf.Loc("2024") // partial key (all 2024 rows)All methods return a new DataFrame and propagate errors — once an error occurs, subsequent operations become no-ops:
a = a.Rename("Origin", "Country").
Filter(dataframe.F{"Age", series.Less, 50}).
Filter(dataframe.F{"Origin", series.Eq, "United States"}).
Select([]string{"Id", "Origin", "Date"}).
Subset([]int{1, 3})
if a.Err != nil {
log.Fatal(a.Err)
}file, _ := os.Create("output.csv")
defer file.Close()
df.WriteCSV(file)
df.WriteJSON(w)fmt.Println(flights)
> [336776x20] DataFrame
>
> X0 year month day dep_time sched_dep_time dep_delay arr_time ...
> 0: 1 2013 1 1 517 515 2 830 ...
> ...type matrix struct{ dataframe.DataFrame }
func (m matrix) At(i, j int) float64 { return m.Elem(i, j).Float() }
func (m matrix) T() mat.Matrix { return mat.Transpose{m} }Load a gonum/mat.Matrix:
df := dataframe.LoadMatrix(mat)df.Head(5) // first 5 rows
df.Tail(10) // last 10 rowsdf.Describe() // summary statistics (count, mean, std, min, max, quartiles)df.Info(os.Stdout)
// Prints dimensions, column types, non-null counts, memory estimatevc := df.ValueCounts("category", false, false) // counts, descending
vc := df.ValueCounts("category", true, false) // proportionstop10 := df.NLargest(10, "revenue")
bottom5 := df.NSmallest(5, "price")sample := df.Sample(100, -1, false, 42) // 100 rows, fixed seed
sample := df.Sample(-1, 0.1, false, 42) // 10% of rows
sample := df.Sample(1000, -1, true, 42) // with replacementmask := df.IsNull() // or df.IsNA() — true where value is NaN
mask := df.NotNull() // or df.NotNA() — true where value is present
// Drop rows with any NaN (or all NaN) in subset of columns
df2 := df.DropNA(dataframe.NAHowAny, "col1", "col2")
df2 := df.DropNA(dataframe.NAHowAll) // only drop rows where ALL columns are NaN
// Drop duplicate rows
df2 := df.DropDuplicates("key1", "key2")lower, upper := 0.0, 100.0
df2 := df.Clip(&lower, &upper) // all numeric columns
df2 := df.ClipColumn("discount", &lower, &upper) // single columndf2 := df.Replace("N/A", nil) // whole DataFrame
df2 := df.ReplaceInColumn("status", "unknown", nil) // single columndf2 := df.Astype(map[string]series.Type{
"price": series.Float,
"qty": series.Int,
"active": series.Bool,
})mask := df.Between("age", 18, 65, "both") // "both"|"neither"|"left"|"right"
mask := df.IsIn("country", []interface{}{"US", "UK", "CA"})
df2 := df.FilterIsIn("country", []interface{}{"US", "UK", "CA"})result := df.
Filter(dataframe.F{"age", series.Greater, 18}).
Pipe(customTransform).
Arrange(dataframe.Sort("name"))
// With extra arguments
result := df.PipeWithArgs(customFunc, arg1, arg2)
// Element-wise map
df2 := df.ApplyMap(func(val interface{}) interface{} {
if s, ok := val.(string); ok {
return strings.ToUpper(s)
}
return val
})Series are typed vectors with NaN support. They are the building blocks for DataFrame columns.
Supported types: Int, Float, String, Bool, Time
s := series.New([]string{"b", "a"}, series.String, "COL.1")
// Convenience constructors
series.Strings(values)
series.Ints(values)
series.Floats(values)
series.Bools(values)
series.Times(values)Core methods: Len, Elem, Val, Float, Int, Int64, Bool, Records,
Copy, Subset, Set, Append, Concat, Slice, Map, Order, Unique,
NUnique, ValueCounts, HasNaN, IsNaN, FillNaN, Compare, Empty.
Statistics: Mean, StdDev, Median, Min, Max, MinStr, MaxStr,
Sum, Quantile.
NaN behaviour:
nilvalues and the string"NaN"are treated as missingInt(math.Inf(...))→ NaN;Float(math.NaN())→ NaN element- Comparison operators (
Eq,Less, etc.) always returnfalsewhen either operand is NaN Boolonly accepts0/1,true/false,t/f— other values become NaN
s := series.New([]interface{}{"a", "b", nil}, series.String, "COL.1")
s.FillNaN(series.Strings("c"))
s.FillNaNForward() // ffill: propagate last valid value forward
s.FillNaNBackward() // bfill: propagate next valid value backwards := series.New([]interface{}{1.0, nil, nil, nil, 5.0}, series.Float, "x")
s.FillNaNForwardLimit(1) // → [1, 1, NaN, NaN, 5] (fill at most 1 gap)
s.FillNaNBackwardLimit(0) // → [1, 5, 5, 5, 5] (0 = unlimited)s := series.New([]float64{1, 2, 3, 4, 5}, series.Float, "x")
s.Rolling(3).Mean()
s.Rolling(3).MinPeriods(1).Mean() // emit result with at least 1 observation
s.Rolling(3).Sum()
s.Rolling(3).Min() // O(n) monotonic deque algorithm
s.Rolling(3).Max() // O(n) monotonic deque algorithm
s.Rolling(3).StdDev() // Bessel-corrected (ddof=1)
s.Rolling(3).Apply(func(w []float64) float64 {
return w[len(w)-1] - w[0]
})By default minPeriods equals the window size — leading positions without a
full window emit NaN. Use MinPeriods(1) to emit results as soon as one
observation is available.
### EWM (Exponentially Weighted Moving)
Mirrors the `pandas.ewm()` interface. `alpha = 2 / (span + 1)`.
```go
s := series.New([]float64{1, 2, 3, 4, 5}, series.Float, "price")
s.EWM(3).Mean() // adjusted mode (pandas default), span=3
s.EWMAlpha(0.5).Mean() // specify alpha directly (equivalent to span=3)
s.EWM(3).Adjust(false).Mean() // recursive mode: y[i] = α·x[i] + (1-α)·y[i-1]
s.EWM(3).MinPeriods(2).Mean() // NaN until 2 valid observations seen
s.EWM(3).IgnoreNA(true).Mean()
s.EWM(3).Var() // exponentially weighted variance (ddof=1)
s.EWM(3).Std() // sqrt of Var
| Mode | Formula |
|---|---|
Adjust(true) (default) |
Σ (1-α)^k · x[i-k] / Σ (1-α)^k |
Adjust(false) |
y[i] = α·x[i] + (1-α)·y[i-1] |
s.CumSum() // [1, 3, 6, 10, 15]
s.CumProd() // [1, 2, 6, 24, 120]
s.CumMax() // running maximum
s.CumMin() // running minimumNaN propagates: once a NaN appears, all subsequent values are also NaN.
s := series.New([]float64{10, 12, 15, 11}, series.Float, "close")
s.Diff(1) // [NaN, 2, 3, -4]
s.Diff(2) // [NaN, NaN, 5, -1]
s.PctChange(1) // [NaN, 0.20, 0.25, -0.267]PctChange divides by abs(prev), returning NaN when the previous value is 0.
NaN pairs are excluded. Returns NaN if fewer than 2 valid pairs exist.
x := series.New([]float64{1, 2, 3, 4, 5}, series.Float, "x")
y := series.New([]float64{2, 4, 6, 8, 10}, series.Float, "y")
corr := x.Corr(y) // 1.0 (Pearson)
cov := x.Cov(y) // 5.0 (sample covariance, ddof=1)Generic batch conversion using Go generics (Go 1.18+). Allocates directly without pool reuse, so the returned Series owns its memory safely.
// Generic — works with any source slice type
s := series.BatchConvert([]int{1, 2, 3}, series.Float, "values")
// Typed convenience helpers
s := series.BatchConvertInts([]int{1, 2, 3}, series.Float, "values")
s := series.BatchConvertFloats([]float64{1.5, 2.5}, series.String, "values")
s := series.BatchConvertStrings([]string{"1", "2", "invalid"}, series.Int, "values")
s := series.BatchConvertBools([]bool{true, false}, series.Int, "values")Conversion rules:
- Invalid string → NaN (e.g.
"abc"to Int) int/int64→time.Timeviatime.Unix(v, 0)string→time.Timerequires RFC3339 format; others become NaN
Categorical is a memory-efficient column type for low-cardinality string data
(country codes, status labels, enum-like columns). It uses dictionary encoding:
a sorted slice of unique strings plus a []int32 code array.
// Create from string slice
cat := series.NewCategorical([]string{"US", "UK", "US", "DE"}, "country")
// Convert from/to regular String Series
cat, err := series.CategoricalFromSeries(s)
s := cat.ToSeries()
// Inspect
cat.Len() // number of rows
cat.NCategories() // number of distinct values
cat.Categories() // sorted dictionary slice
cat.Get(i) // string value at row i
cat.IsNA(i) // true if row i is missing
// Frequency counts
counts := cat.ValueCounts() // map[string]int
// Modify
cat.AddCategory("FR") // extend dictionary
cat.SetValue(0, "FR") // set row value (must be in dictionary)
// Filter
filtered, err := cat.Filter([]bool{true, false, true, false})
// Memory estimate
bytes := cat.MemoryBytes()df.Shift(1) // shift all columns down by 1 row (NaN at top)
df.Shift(-2, "price") // shift "price" up by 2 rows (NaN at bottom)df2 := df.Assign("profit", func(d dataframe.DataFrame) series.Series {
rev := d.Col("revenue").Float()
cost := d.Col("cost").Float()
out := make([]float64, len(rev))
for i := range rev { out[i] = rev[i] - cost[i] }
return series.Floats(out)
})// "tags" column: "go,python" → two rows
df2 := df.Explode("tags")df.Query("age > 18")
df.Query("status == active")
df.Query("age >= 18 AND age <= 65")
df.Query("country in US,UK,CA")
df.Query("score > 0.5 OR label == good")Operators: ==, !=, >, >=, <, <=, in, not in.
Combine with AND / OR (case-insensitive). Column names containing
operator substrings (e.g. income, bandwidth) are handled correctly.
// wide → long (alias for Melt)
long := df.Stack([]string{"id"}, []string{"q1","q2","q3"}, "quarter", "value")
// long → wide
wide := df.Unstack([]string{"id"}, "quarter", "value")rg := df.Resample("date", dataframe.ResampleMonthly) // D/W/M/Y/H
monthly := rg.Aggregation(
[]dataframe.AggregationType{dataframe.Aggregation_SUM},
[]string{"revenue"},
)
// result has "period" column + aggregated columnsdf.CapplyParallel(f) // parallel column-wise apply
df.RapplyParallel(f) // parallel row-wise apply
groups.AggregationParallel(typs, colnames) // parallel GroupBy aggregation// Read
df := dataframe.ReadNDJSON(r)
// Write (NaN → null)
err := df.WriteNDJSON(w)df := dataframe.ReadXLSXFile("data.xlsx", dataframe.WithSheet("Sheet2"))// PostgreSQL ($1, $2, …)
err := df.WriteSQL(pgDB, "users",
dataframe.WithPlaceholderStyle(dataframe.SQLPlaceholderDollar))
// SQL Server (@p1, @p2, …)
err := df.WriteSQL(msDB, "users",
dataframe.WithPlaceholderStyle(dataframe.SQLPlaceholderAt))err := dataframe.ScanCSV(f, 1000, func(batch dataframe.DataFrame) error {
// process 1000-row batch
return nil
})MIT — see LICENSE.md