Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ module go.kenn.io/kit
go 1.26.3

require (
github.com/asg017/sqlite-vec-go-bindings v0.1.6
github.com/gofrs/flock v0.13.0
github.com/mattn/go-sqlite3 v1.14.44
github.com/posthog/posthog-go v1.12.6
github.com/stretchr/testify v1.11.1
go.opentelemetry.io/otel v1.43.0
Expand All @@ -14,13 +16,15 @@ require (
golang.org/x/sys v0.44.0
golang.org/x/term v0.43.0
golang.org/x/tools v0.45.0
modernc.org/sqlite v1.53.0
)

require (
github.com/bitfield/gotestdox v0.2.2 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/dnephin/pflag v1.0.7 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/fatih/color v1.18.0 // indirect
github.com/fsnotify/fsnotify v1.9.0 // indirect
github.com/go-logr/logr v1.4.3 // indirect
Expand All @@ -32,14 +36,19 @@ require (
github.com/klauspost/compress v1.18.6 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/ncruces/go-strftime v1.0.0 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
go.opentelemetry.io/otel/sdk v1.43.0 // indirect
go.opentelemetry.io/otel/trace v1.43.0 // indirect
go.uber.org/nilaway v0.0.0-20260528182042-490362de4fb6 // indirect
golang.org/x/text v0.17.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
gotest.tools/gotestsum v1.13.0 // indirect
modernc.org/libc v1.73.4 // indirect
modernc.org/mathutil v1.7.1 // indirect
modernc.org/memory v1.11.0 // indirect
)

tool (
Expand Down
40 changes: 40 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
github.com/asg017/sqlite-vec-go-bindings v0.1.6 h1:Nx0jAzyS38XpkKznJ9xQjFXz2X9tI7KqjwVxV8RNoww=
github.com/asg017/sqlite-vec-go-bindings v0.1.6/go.mod h1:A8+cTt/nKFsYCQF6OgzSNpKZrzNo5gQsXBTfsXHXY0Q=
github.com/bitfield/gotestdox v0.2.2 h1:x6RcPAbBbErKLnapz1QeAlf3ospg8efBsedU93CDsnE=
github.com/bitfield/gotestdox v0.2.2/go.mod h1:D+gwtS0urjBrzguAkTM2wodsTQYFHdpx8eqRJ3N+9pY=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
Expand All @@ -6,6 +8,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dnephin/pflag v1.0.7 h1:oxONGlWxhmUct0YzKTgrpQv9AUA1wtPBn7zuSjJqptk=
github.com/dnephin/pflag v1.0.7/go.mod h1:uxE91IoWURlOiTUIA8Mq5ZZkAv3dPUfZNaT80Zm7OQE=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM=
github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU=
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
Expand All @@ -21,6 +25,8 @@ github.com/gofrs/flock v0.13.0 h1:95JolYOvGMqeH31+FC7D2+uULf6mG61mEZ/A8dRYMzw=
github.com/gofrs/flock v0.13.0/go.mod h1:jxeyy9R1auM5S6JYDBhDt+E2TCo7DkratH4Pgi8P+Z0=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4=
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
Expand All @@ -38,10 +44,16 @@ github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovk
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-sqlite3 v1.14.44 h1:3VSe+xafpbzsLbdr2AWlAZk9yRHiBhTBakioXaCKTF8=
github.com/mattn/go-sqlite3 v1.14.44/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ=
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/posthog/posthog-go v1.12.6 h1:N+FrKWY6DOuDhV2OMgvtKAKDYGTdtS9/nuvr0BTyBp0=
github.com/posthog/posthog-go v1.12.6/go.mod h1:xsVOW9YImilUcazwPNEq4PJDqEZf2KeCS758zXjwkPg=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
Expand Down Expand Up @@ -87,3 +99,31 @@ gotest.tools/gotestsum v1.13.0 h1:+Lh454O9mu9AMG1APV4o0y7oDYKyik/3kBOiCqiEpRo=
gotest.tools/gotestsum v1.13.0/go.mod h1:7f0NS5hFb0dWr4NtcsAsF0y1kzjEFfAil0HiBQJE03Q=
gotest.tools/v3 v3.5.2 h1:7koQfIKdy+I8UTetycgUqXWSDwpgv193Ka+qRsmBY8Q=
gotest.tools/v3 v3.5.2/go.mod h1:LtdLGcnqToBH83WByAAi/wiwSFCArdFIUV/xxN4pcjA=
modernc.org/cc/v4 v4.28.4 h1:Hd/4Es+MBj+/7hSdZaisNyu6bv3V0Dp2MdllyfqaH+c=
modernc.org/cc/v4 v4.28.4/go.mod h1:OnovgIhbbMXMu1aISnJ0wvVD1KnW+cAUJkIrAWh+kVI=
modernc.org/ccgo/v4 v4.34.4 h1:OVnSOWQjVKOYkFxoHYB+qQmSHK5gqMqARM+K9DpR/Ws=
modernc.org/ccgo/v4 v4.34.4/go.mod h1:qdKqE8FNIYyysougB1RX9MxCzp5oJOcQXSobANJ4TuE=
modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
modernc.org/gc/v3 v3.1.3 h1:6QAplYyVO+KdPW3pGnqmJDUxtkec8ooEWvks/hhU3lc=
modernc.org/gc/v3 v3.1.3/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
modernc.org/libc v1.73.4 h1:+ra4Ui8ngyt8HDcO1FTDPWlkAh6yOdaO2yAoh8MddQA=
modernc.org/libc v1.73.4/go.mod h1:DXZ3eO8qMCNn2SnmTNCiC71nJ9Rcq3PsnpU6Vc4rWK8=
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
modernc.org/opt v0.2.0 h1:tGyef5ApycA7FSEOMraay9SaTk5zmbx7Tu+cJs4QKZg=
modernc.org/opt v0.2.0/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
modernc.org/sqlite v1.53.0 h1:20WG8N9q4ji/dEqGk4uiI0c6OPjSeLTNYGFCc3+7c1M=
modernc.org/sqlite v1.53.0/go.mod h1:xoEpOIpGrgT48H5iiyt/YXPCZPEzlfmfFwtk8Lklw8s=
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
41 changes: 41 additions & 0 deletions vector/AGENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# vector package invariants

`go.kenn.io/kit/vector` owns the backend-neutral parts of an embedding
pipeline. Preserve these invariants when changing it.

## The storage boundary is the point of this package

- The core `vector` package must not import `database/sql`, a driver, or
any backend client, and must not construct backend SQL. The `Fill` and
`Search` flows reach storage only through the `Store[K, G]` interface.
- Persistence is a function of the caller's source system. Backends live
in their own subpackages (e.g. `vector/sqlitevec`) so a caller wiring
one backend never pulls another backend's driver. New backends
(pgvector, duckdb) go in sibling subpackages, not into the core.
- Backends own query construction. The differences between sqlite-vec
`vec0 MATCH`, pgvector `<=>`, and duckdb `array_distance` belong behind
`QueryGeneration`, never in the core flows.

## Keys and generations are opaque

- Document identity is the caller's type `K` and generation identity its
type `G`. msgvault uses `int64`; kata uses UUIDs. Compare them for
equality only; never assume a type, a single id namespace, or an
ordering. Backends additionally require `K`/`G` to be types
`database/sql` can bind and scan.

## Merge semantics

- `Merge` takes per-generation lists in descending order of preference
and, when the same document appears in more than one list, keeps the
hit from the earliest list (prefer the newer generation during a
migration). Coverage is a union — never drop a document that only one
generation covers, and never emit duplicates.
- Cross-generation scores are not comparable. Default to
`MergeNormalizedScore`; raw-score merging is opt-in.

## Generations during migration

- The mid-migration union exists because new documents land only in the
building generation while the active generation still serves the bulk.
`Search` must keep querying every generation `LiveGenerations` returns,
in the order it returns them.
48 changes: 48 additions & 0 deletions vector/chunk.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package vector

// Chunk is a window of text encoded as a single vector.
type Chunk struct {
// Index is the chunk's ordinal position within the source content:
// 0 for the first chunk, 1 for the second, and so on. It is a chunk
// counter, not a rune or byte offset.
Index int
// Text is the chunk's content.
Text string
}

// SplitOptions controls how Split windows content into chunks.
type SplitOptions struct {
// MaxRunes bounds the number of runes in each chunk. Values <= 0
// disable splitting and return the content as a single chunk.
MaxRunes int
// Overlap is the number of runes shared between consecutive chunks.
// It is clamped to the range [0, MaxRunes-1], so negative values
// behave like zero and each chunk always advances by at least one
// rune. Overlap has no effect when the content is returned as a
// single chunk.
Overlap int
}

// Split windows content into overlapping chunks of at most MaxRunes runes.
// It splits on runes rather than bytes so multi-byte characters are never
// torn apart. Empty content yields no chunks (a nil slice).
//
// When o.MaxRunes <= 0, or the content already fits within o.MaxRunes
// runes, the content is returned unchanged as a single chunk with Index 0.
// Otherwise consecutive chunks start o.MaxRunes-overlap runes apart, where
// overlap is o.Overlap clamped to [0, o.MaxRunes-1]; every chunk except
// possibly the last holds exactly o.MaxRunes runes, and chunk indexes
// count up from zero.
//
// Split measures size in runes, not model tokens. Callers that budget by
// tokens should convert their token budget to an approximate rune count.
//
// Note: when the content is actually windowed it is round-tripped through
// a rune slice, so any invalid UTF-8 bytes come back as U+FFFD in the
// chunk text. The single-chunk paths return the content untouched.
func Split(content string, o SplitOptions) []Chunk {
	if content == "" {
		return nil
	}
	// A string never has more runes than bytes, so the byte length settles
	// both the splitting-disabled case and the common short-content case
	// without allocating a rune slice.
	if o.MaxRunes <= 0 || len(content) <= o.MaxRunes {
		return []Chunk{{Index: 0, Text: content}}
	}
	runes := []rune(content)
	if len(runes) <= o.MaxRunes {
		return []Chunk{{Index: 0, Text: content}}
	}

	// Clamping keeps the stride at >= 1 so the window always advances.
	overlap := min(max(o.Overlap, 0), o.MaxRunes-1)
	stride := o.MaxRunes - overlap

	// The final window is the first one whose end reaches the content's
	// end, i.e. window k where k = ceil((len-MaxRunes)/stride). That makes
	// k+1 chunks in total, so the result can be sized exactly up front.
	count := (len(runes)-o.MaxRunes+stride-1)/stride + 1
	chunks := make([]Chunk, 0, count)

	for start := 0; ; start += stride {
		end := start + o.MaxRunes
		if end >= len(runes) {
			// Last window: take whatever remains and stop.
			chunks = append(chunks, Chunk{Index: len(chunks), Text: string(runes[start:])})
			return chunks
		}
		chunks = append(chunks, Chunk{Index: len(chunks), Text: string(runes[start:end])})
	}
}
84 changes: 84 additions & 0 deletions vector/chunk_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package vector_test

import (
"testing"

"github.com/stretchr/testify/assert"

"go.kenn.io/kit/vector"
)

// TestSplit runs vector.Split over a table of inputs covering the empty,
// single-chunk, non-overlapping, overlapping, and clamped-overlap cases,
// and checks the exact chunks produced for each.
func TestSplit(t *testing.T) {
	// indexed builds the expected chunk list from its texts, numbering
	// the chunks from zero in argument order.
	indexed := func(texts ...string) []vector.Chunk {
		out := make([]vector.Chunk, 0, len(texts))
		for i, text := range texts {
			out = append(out, vector.Chunk{Index: i, Text: text})
		}
		return out
	}

	type splitCase struct {
		name    string
		content string
		opts    vector.SplitOptions
		want    []vector.Chunk
	}

	cases := []splitCase{
		{
			name:    "empty yields no chunks",
			content: "",
			opts:    vector.SplitOptions{MaxRunes: 4},
			want:    nil,
		},
		{
			name:    "non-positive max returns single chunk",
			content: "hello world",
			opts:    vector.SplitOptions{MaxRunes: 0},
			want:    indexed("hello world"),
		},
		{
			name:    "content shorter than max is one chunk",
			content: "abcd",
			opts:    vector.SplitOptions{MaxRunes: 8},
			want:    indexed("abcd"),
		},
		{
			name:    "windows without overlap",
			content: "abcdefghij",
			opts:    vector.SplitOptions{MaxRunes: 5},
			want:    indexed("abcde", "fghij"),
		},
		{
			name:    "windows with overlap",
			content: "abcdefghij",
			opts:    vector.SplitOptions{MaxRunes: 4, Overlap: 1},
			want:    indexed("abcd", "defg", "ghij"),
		},
		{
			name:    "overlap at or above max clamps to max-1",
			content: "abcdef",
			opts:    vector.SplitOptions{MaxRunes: 3, Overlap: 9},
			want:    indexed("abc", "bcd", "cde", "def"),
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			got := vector.Split(tc.content, tc.opts)
			assert.Equal(t, tc.want, got)
		})
	}
}

func TestSplitDoesNotTearMultiByteRunes(t *testing.T) {
assert := assert.New(t)
// Each emoji is multiple bytes but one rune.
chunks := vector.Split("😀😁😂🤣", vector.SplitOptions{MaxRunes: 2})

assert.Equal([]vector.Chunk{
{Index: 0, Text: "😀😁"},
{Index: 1, Text: "😂🤣"},
}, chunks)
}
24 changes: 24 additions & 0 deletions vector/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Package vector provides backend-neutral building blocks for embedding
// content and searching the resulting vectors.
//
// It is organized in three layers:
//
// - Transforms and value types: Split windows content into chunks,
// Generation identifies an embedding model, EncodeBatched batches
// encode calls, and RollupByDocument and Merge reduce and combine
// search results across generations. These are pure functions.
//
// - The Store contract: Store[K, G] is the persistence interface the
// flows depend on. Implementations are a function of the caller's
// source system and own all backend SQL and query construction; see
// the sqlitevec subpackage for a worked example.
//
// - Flows: Fill runs the scan-and-fill embedding loop and Search runs
// the cross-generation query-and-merge, both over a Store.
//
// Nothing in this package opens a database, holds an index, or constructs
// backend SQL — the flows delegate every storage operation to the Store.
// Document identity is the caller's own key type K, and generation
// identity its type G; the package compares both for equality but never
// interprets them.
package vector
Loading
Loading