@@ -13,7 +13,9 @@ import (
1313 "github.com/apache/arrow/go/v17/arrow/array"
1414)
1515
16- type ProjectCSVLeaf struct {
16+ // TODO: rename the remaining "leaf" identifiers (e.g. NewProjectCSVLeaf) to use "scan" naming instead
17+
18+ type CSVSource struct {
1719 r * csv.Reader
1820 schema * arrow.Schema // columns to project as well as types to cast to
1921 colPosition map [string ]int
@@ -22,9 +24,9 @@ type ProjectCSVLeaf struct {
2224}
2325
2426// assume everything is on disk for now
25- func NewProjectCSVLeaf (source io.Reader ) (* ProjectCSVLeaf , error ) {
27+ func NewProjectCSVLeaf (source io.Reader ) (* CSVSource , error ) {
2628 r := csv .NewReader (source )
27- proj := & ProjectCSVLeaf {
29+ proj := & CSVSource {
2830 r : r ,
2931 colPosition : make (map [string ]int ),
3032 }
@@ -34,31 +36,32 @@ func NewProjectCSVLeaf(source io.Reader) (*ProjectCSVLeaf, error) {
3436 return proj , err
3537}
3638
37- func (pcsv * ProjectCSVLeaf ) Next (n uint64 ) (* operators.RecordBatch , error ) {
38- if pcsv .done {
39+ func (csvS * CSVSource ) Next (n uint64 ) (* operators.RecordBatch , error ) {
40+ if csvS .done {
3941 return nil , io .EOF
4042 }
4143
4244 // 1. Create builders
43- builders := pcsv .initBuilders ()
45+ builders := csvS .initBuilders ()
4446
4547 rowsRead := uint64 (0 )
4648
4749 // Process stored first row (from parseHeader) ---
48- if pcsv .firstDataRow != nil && rowsRead < n {
49- if err := pcsv .processRow (pcsv .firstDataRow , builders ); err != nil {
50+ if csvS .firstDataRow != nil && rowsRead < n {
51+ fmt .Printf ("First row: %v\n " , csvS .firstDataRow )
52+ if err := csvS .processRow (csvS .firstDataRow , builders ); err != nil {
5053 return nil , err
5154 }
52- pcsv .firstDataRow = nil // consume it once
55+ csvS .firstDataRow = nil // consume it once
5356 rowsRead ++
5457 }
5558
5659 // Stream remaining rows from CSV reader ---
5760 for rowsRead < n {
58- row , err := pcsv .r .Read ()
61+ row , err := csvS .r .Read ()
5962 if err == io .EOF {
6063 if rowsRead == 0 {
61- pcsv .done = true
64+ csvS .done = true
6265 return nil , io .EOF
6366 }
6467 break
@@ -68,24 +71,24 @@ func (pcsv *ProjectCSVLeaf) Next(n uint64) (*operators.RecordBatch, error) {
6871 }
6972
7073 // append to builders
71- if err := pcsv .processRow (row , builders ); err != nil {
74+ if err := csvS .processRow (row , builders ); err != nil {
7275 return nil , err
7376 }
7477
7578 rowsRead ++
7679 }
7780
7881 // Freeze into Arrow arrays
79- columns := pcsv .finalizeBuilders (builders )
82+ columns := csvS .finalizeBuilders (builders )
8083
8184 return & operators.RecordBatch {
82- Schema : pcsv .schema ,
85+ Schema : csvS .schema ,
8386 Columns : columns ,
8487 }, nil
8588}
8689
87- func (pcsv * ProjectCSVLeaf ) initBuilders () []array.Builder {
88- fields := pcsv .schema .Fields ()
90+ func (csvS * CSVSource ) initBuilders () []array.Builder {
91+ fields := csvS .schema .Fields ()
8992 builders := make ([]array.Builder , len (fields ))
9093
9194 for i , f := range fields {
@@ -94,14 +97,14 @@ func (pcsv *ProjectCSVLeaf) initBuilders() []array.Builder {
9497
9598 return builders
9699}
97- func (pcsv * ProjectCSVLeaf ) processRow (
100+ func (csvS * CSVSource ) processRow (
98101 content []string ,
99102 builders []array.Builder ,
100103) error {
101- fields := pcsv .schema .Fields ()
102-
104+ fields := csvS .schema .Fields ()
105+ fmt . Printf ( "content : %v \n " , content )
103106 for i , f := range fields {
104- colIdx := pcsv .colPosition [f .Name ]
107+ colIdx := csvS .colPosition [f .Name ]
105108 cell := content [colIdx ]
106109
107110 switch b := builders [i ].(type ) {
@@ -143,7 +146,7 @@ func (pcsv *ProjectCSVLeaf) processRow(
143146
144147 return nil
145148}
146- func (pcsv * ProjectCSVLeaf ) finalizeBuilders (builders []array.Builder ) []arrow.Array {
149+ func (csvS * CSVSource ) finalizeBuilders (builders []array.Builder ) []arrow.Array {
147150 columns := make ([]arrow.Array , len (builders ))
148151
149152 for i , b := range builders {
@@ -155,16 +158,16 @@ func (pcsv *ProjectCSVLeaf) finalizeBuilders(builders []array.Builder) []arrow.A
155158}
156159
157160// parseHeader makes the first reads from the csv.Reader: the header row, then the first data row
158- func (pscv * ProjectCSVLeaf ) parseHeader () (* arrow.Schema , error ) {
159- header , err := pscv .r .Read ()
161+ func (csvS * CSVSource ) parseHeader () (* arrow.Schema , error ) {
162+ header , err := csvS .r .Read ()
160163 if err != nil {
161164 return nil , err
162165 }
163- firstDataRow , err := pscv .r .Read ()
166+ firstDataRow , err := csvS .r .Read ()
164167 if err != nil {
165168 return nil , err
166169 }
167- pscv .firstDataRow = firstDataRow
170+ csvS .firstDataRow = firstDataRow
168171 newFields := make ([]arrow.Field , 0 , len (header ))
169172 for i , colName := range header {
170173 sampleValue := firstDataRow [i ]
@@ -173,7 +176,7 @@ func (pscv *ProjectCSVLeaf) parseHeader() (*arrow.Schema, error) {
173176 Type : parseDataType (sampleValue ),
174177 Nullable : true ,
175178 })
176- pscv .colPosition [colName ] = i
179+ csvS .colPosition [colName ] = i
177180 }
178181 return arrow .NewSchema (newFields , nil ), nil
179182}
0 commit comments