Skip to content

Commit f35fec3

Browse files
committed
io.Reader record factory
1 parent f9a3fd4 commit f35fec3

File tree

6 files changed

+282
-82
lines changed

6 files changed

+282
-82
lines changed

README.md

+40-17
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,18 @@ The goal is to provide a useful toolkit to make it easier to use Arrow, and by e
88
Bodkin enables you to use your _data_ to define and evolve your Arrow Schema.
99

1010
## Features
11-
11+
### Arrow schema generation from data type inference
1212
- Converts a structured input (json string or []byte, Go struct or map[string]any) into an Apache Arrow schema
13-
- Supports nested types
13+
- Supports nested types
1414
- Automatically evolves the Arrow schema with new fields when providing new inputs
15+
- Option to merge a newly inferred schema at an existing path for composability
1516
- Converts schema field types when unifying schemas to accept evolving input data
1617
- Tracks changes to the schema
17-
- Export/import a serialized Arrow schema to/from file or []byte to transmit or persist schema definition
18-
- Custom data loader to load structured data directly to Arrow Records based on inferred schema
18+
- Export/import a serialized Arrow schema to/from file or `[]byte` to transmit or persist schema definition
19+
### Custom data loader
20+
- Load structured data directly to Arrow Records based on inferred schema
21+
- Individual input to Arrow Record
22+
- io.Reader stream to Arrow Records
1923

2024
## 🚀 Install
2125

@@ -138,11 +142,41 @@ Also works with nested Go structs and slices
138142
// - Age: type=int32, nullable
139143
```
140144

145+
Export your schema to a file, then import the file to retrieve the schema; or export/import to/from a []byte.
146+
```go
147+
_ = u.ExportSchemaFile("./test.schema")
148+
imp, _ := u.ImportSchemaFile("./test.schema")
149+
fmt.Printf("imported %v\n", imp.String())
150+
151+
bs, _ := u.ExportSchemaBytes()
152+
sc, _ := u.ImportSchemaBytes(bs)
153+
fmt.Printf("imported %v\n", sc.String())
154+
```
155+
141156
Use a Bodkin Reader to load data to Arrow Records
142157
```go
143158
u := bodkin.NewBodkin(bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion())
144-
r, _ := u.NewReader()
145-
rec, _ := r.ReadToRecord([]byte(jsonS1))
159+
u.Unify(jsonS1) // feed data for schema generation
160+
rdr, _ := u.NewReader() // inferred schema in Bodkin used to create Reader
161+
rec, _ := rdr.ReadToRecord([]byte(jsonS1)) // Reader loads data and returns Arrow Record
162+
```
163+
164+
Provide a Bodkin Reader with an io.Reader to load many records
165+
```go
166+
import "github.com/loicalleyne/bodkin/reader"
167+
...
168+
u := bodkin.NewBodkin(bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion())
169+
// Create Reader attached to Bodkin ...
170+
u.NewReader(reader.WithIOReader(ff, reader.DefaultDelimiter), reader.WithChunk(1024))
171+
for u.Reader.Next(){
172+
rec := u.Reader.Record()
173+
}
174+
// or create a stand-alone Reader if you have an existing *arrow.Schema
175+
rdr, _ := reader.NewReader(schema, 0, reader.WithIOReader(ff, reader.DefaultDelimiter), reader.WithChunk(1024))
176+
for rdr.Next() {
177+
rec := rdr.Record()
178+
...
179+
}
146180
```
147181

148182
Use the generated Arrow schema with Arrow's built-in JSON reader to decode JSON data into Arrow records
@@ -159,17 +193,6 @@ for rdr.Next() {
159193
// ]
160194
```
161195

162-
Export your schema to a file, then import the file to retrieve the schema; or export/import to/from a []byte.
163-
```go
164-
_ = u.ExportSchemaFile("./test.schema")
165-
imp, _ := u.ImportSchemaFile("./test.schema")
166-
fmt.Printf("imported %v\n", imp.String())
167-
168-
bs, _ := u.ExportSchemaBytes()
169-
sc, _ := u.ImportSchemaBytes(bs)
170-
fmt.Printf("imported %v\n", sc.String())
171-
```
172-
173196
## 💫 Show your support
174197

175198
Give a ⭐️ if this project helped you!

bodkin.go

+28-16
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,6 @@ type (
2626
config *Bodkin
2727
)
2828

29-
type (
30-
ReaderOption func(reader.Option)
31-
readerConfig *reader.DataReader
32-
)
33-
3429
// Field represents an element in the input data.
3530
type Field struct {
3631
Dotpath string `json:"dotpath"`
@@ -49,12 +44,13 @@ const (
4944
// Bodkin is a collection of field paths, describing the columns of a structured input(s).
5045
type Bodkin struct {
5146
rr io.Reader
52-
sf bufio.SplitFunc
53-
sc *bufio.Scanner
47+
br *bufio.Reader
48+
delim byte
5449
original *fieldPos
5550
old *fieldPos
5651
new *fieldPos
57-
r *reader.DataReader
52+
opts []Option
53+
Reader *reader.DataReader
5854
knownFields *omap.OrderedMap[string, *fieldPos]
5955
untypedFields *omap.OrderedMap[string, *fieldPos]
6056
unificationCount int
@@ -66,6 +62,8 @@ type Bodkin struct {
6662
changes error
6763
}
6864

65+
func (u *Bodkin) Opts() []Option { return u.opts }
66+
6967
func (u *Bodkin) NewReader(opts ...reader.Option) (*reader.DataReader, error) {
7068
schema, err := u.Schema()
7169
if err != nil {
@@ -74,11 +72,11 @@ func (u *Bodkin) NewReader(opts ...reader.Option) (*reader.DataReader, error) {
7472
if schema == nil {
7573
return nil, fmt.Errorf("nil schema")
7674
}
77-
r, err := reader.NewReader(schema, 0, opts...)
75+
u.Reader, err = reader.NewReader(schema, 0, opts...)
7876
if err != nil {
7977
return nil, err
8078
}
81-
return r, nil
79+
return u.Reader, nil
8280
}
8381

8482
// NewBodkin returns a new Bodkin value from a structured input.
@@ -91,6 +89,7 @@ func NewBodkin(opts ...Option) *Bodkin {
9189

9290
func newBodkin(opts ...Option) *Bodkin {
9391
b := &Bodkin{}
92+
b.opts = opts
9493
for _, opt := range opts {
9594
opt(b)
9695
}
@@ -182,7 +181,7 @@ func (u *Bodkin) ExportSchemaFile(exportPath string) error {
182181
return err
183182
}
184183
bs := flight.SerializeSchema(schema, memory.DefaultAllocator)
185-
err = os.WriteFile("./temp.schema", bs, 0644)
184+
err = os.WriteFile(exportPath, bs, 0644)
186185
if err != nil {
187186
return err
188187
}
@@ -261,17 +260,23 @@ func (u *Bodkin) UnifyScan() error {
261260
}
262261
return u.err
263262
}()
264-
for u.sc.Scan() {
265-
m, err := reader.InputMap(u.sc.Bytes())
263+
for {
264+
datumBytes, err := u.br.ReadBytes(u.delim)
265+
if err != nil {
266+
if errors.Is(err, io.EOF) {
267+
u.err = nil
268+
break
269+
}
270+
u.err = err
271+
break
272+
}
273+
m, err := reader.InputMap(datumBytes)
266274
if err != nil {
267275
u.err = errors.Join(u.err, err)
268276
continue
269277
}
270278
u.Unify(m)
271279
}
272-
if err := u.sc.Err(); err != nil {
273-
u.err = errors.Join(u.err, err)
274-
}
275280
return u.err
276281
}
277282

@@ -333,6 +338,8 @@ func (u *Bodkin) OriginSchema() (*arrow.Schema, error) {
333338

334339
// Schema returns the current merged Arrow schema generated from the structure/types of
335340
// the input(s), and a panic recovery error if the schema could not be created.
341+
// If the Bodkin has a Reader and the schema has been updated since its creation, the Reader
342+
// will be replaced with a new one matching the current schema, created with the old Reader's options.
336343
func (u *Bodkin) Schema() (*arrow.Schema, error) {
337344
if u.old == nil {
338345
return nil, fmt.Errorf("bodkin not initialised")
@@ -349,6 +356,11 @@ func (u *Bodkin) Schema() (*arrow.Schema, error) {
349356
fields = append(fields, c.field)
350357
}
351358
s = arrow.NewSchema(fields, nil)
359+
if u.Reader != nil {
360+
if !u.Reader.Schema().Equal(s) {
361+
u.Reader, _ = reader.NewReader(s, 0, u.Reader.Opts()...)
362+
}
363+
}
352364
return s, nil
353365
}
354366

cmd/main.go

+46-40
Original file line numberDiff line numberDiff line change
@@ -8,62 +8,68 @@ import (
88
"time"
99

1010
"github.com/loicalleyne/bodkin"
11+
"github.com/loicalleyne/bodkin/reader"
1112
)
1213

1314
func main() {
1415
start := time.Now()
15-
filepath := "github.json"
16+
filepath := "large-file.json"
1617
log.Println("start")
17-
f, err := os.Open(filepath)
18-
if err != nil {
19-
panic(err)
20-
}
21-
defer f.Close()
22-
s := bufio.NewScanner(f)
23-
u := bodkin.NewBodkin(bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion())
24-
if err != nil {
25-
panic(err)
26-
}
27-
28-
for s.Scan() {
29-
err = u.Unify(s.Bytes())
18+
var u *bodkin.Bodkin
19+
if 1 == 1 {
20+
f, err := os.Open(filepath)
21+
if err != nil {
22+
panic(err)
23+
}
24+
defer f.Close()
25+
s := bufio.NewScanner(f)
26+
u = bodkin.NewBodkin(bodkin.WithInferTimeUnits(), bodkin.WithTypeConversion())
3027
if err != nil {
3128
panic(err)
3229
}
33-
}
34-
f.Close()
35-
schema, err := u.Schema()
36-
if err != nil {
37-
panic(err)
38-
}
39-
log.Printf("union %v\n", schema.String())
40-
log.Printf("elapsed: %v\n", time.Since(start))
4130

42-
ff, err := os.Open(filepath)
43-
if err != nil {
44-
panic(err)
45-
}
46-
defer ff.Close()
47-
r, err := u.NewReader()
48-
if err != nil {
49-
panic(err)
31+
for s.Scan() {
32+
err = u.Unify(s.Bytes())
33+
if err != nil {
34+
panic(err)
35+
}
36+
}
37+
f.Close()
38+
err = u.ExportSchemaFile("temp.bak")
39+
if err != nil {
40+
panic(err)
41+
}
5042
}
51-
i := 0
52-
s = bufio.NewScanner(ff)
53-
for s.Scan() {
54-
rec, err := r.ReadToRecord(s.Bytes())
43+
if 1 == 1 {
44+
schema, err := u.ImportSchemaFile("temp.bak")
5545
if err != nil {
5646
panic(err)
5747
}
58-
_, err = rec.MarshalJSON()
48+
ff, err := os.Open(filepath)
5949
if err != nil {
60-
fmt.Printf("error marshaling record: %v\n", err)
50+
panic(err)
6151
}
62-
// fmt.Printf("\nmarshaled record :\n%v\n", string(rj))
63-
i++
64-
}
52+
defer ff.Close()
53+
r, err := reader.NewReader(schema, 0, reader.WithIOReader(ff, reader.DefaultDelimiter), reader.WithChunk(1024*16))
54+
if err != nil {
55+
panic(err)
56+
}
57+
58+
log.Printf("union %v\n", schema.String())
59+
log.Printf("elapsed: %v\n", time.Since(start))
6560

66-
log.Println("records", i)
61+
i := 0
62+
for r.Next() {
63+
rec := r.Record()
64+
_, err := rec.MarshalJSON()
65+
if err != nil {
66+
fmt.Printf("error marshaling record: %v\n", err)
67+
}
68+
// fmt.Printf("\nmarshaled record :\n%v\n", string(rj))
69+
i++
70+
}
71+
log.Println("records", r.Count(), i)
72+
}
6773
log.Printf("elapsed: %v\n", time.Since(start))
6874
log.Println("end")
6975
}

option.go

+7-6
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,15 @@ func WithMaxCount(i int) Option {
4545
}
4646
}
4747

48-
// WithIOReader provides an io.Reader for a Bodkin to use with UnifyScan().
49-
// A bufio.SplitFunc can optionally be provided, otherwise the default
50-
// ScanLines will be used.
51-
func WithIOReader(r io.Reader, sf bufio.SplitFunc) Option {
48+
// WithIOReader provides an io.Reader for a Bodkin to use with UnifyScan(), along
49+
// with a delimiter to use to split datum in the data stream.
50+
// Default delimiter '\n' if delimiter is not provided.
51+
func WithIOReader(r io.Reader, delim byte) Option {
5252
return func(cfg config) {
5353
cfg.rr = r
54-
if sf != nil {
55-
cfg.sf = sf
54+
cfg.br = bufio.NewReaderSize(cfg.rr, 1024*16)
55+
if delim != '\n' {
56+
cfg.delim = delim
5657
}
5758
}
5859
}

reader/option.go

+41
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
package reader
22

33
import (
4+
"bufio"
5+
"io"
6+
47
"github.com/apache/arrow-go/v18/arrow/memory"
58
)
69

@@ -19,3 +22,41 @@ func WithJSONDecoder() Option {
1922
cfg.jsonDecode = true
2023
}
2124
}
25+
26+
// WithChunk specifies the chunk size used while reading data to Arrow records.
27+
//
28+
// If n is zero or 1, no chunking will take place and the reader will create
29+
// one record per row.
30+
// If n is greater than 1, chunks of n rows will be read.
31+
func WithChunk(n int) Option {
32+
return func(cfg config) {
33+
cfg.chunk = n
34+
}
35+
}
36+
37+
// WithIOReader provides an io.Reader to Bodkin Reader, along with a delimiter
38+
// to use to split datum in the data stream. Default delimiter '\n' if delimiter
39+
// is not provided.
40+
func WithIOReader(r io.Reader, delim byte) Option {
41+
return func(cfg config) {
42+
cfg.rr = r
43+
cfg.br = bufio.NewReaderSize(cfg.rr, 1024*1024*16)
44+
if delim != DefaultDelimiter {
45+
cfg.delim = delim
46+
}
47+
}
48+
}
49+
50+
// WithInputBufferSize specifies the Bodkin Reader's input buffer size.
51+
func WithInputBufferSize(n int) Option {
52+
return func(cfg config) {
53+
cfg.inputBufferSize = n
54+
}
55+
}
56+
57+
// WithRecordBufferSize specifies the Bodkin Reader's record buffer size.
58+
func WithRecordBufferSize(n int) Option {
59+
return func(cfg config) {
60+
cfg.recordBufferSize = n
61+
}
62+
}

0 commit comments

Comments
 (0)