diff --git a/.gitignore b/.gitignore index 18bb522..3970ccd 100644 --- a/.gitignore +++ b/.gitignore @@ -102,4 +102,11 @@ src/Backend/test_data/json # Allow s3_source directory !src/Backend/test_data/s3_source/ -!src/Backend/test_data/s3_source/** \ No newline at end of file +!src/Backend/test_data/s3_source/** + +# Allow a specific CSV dataset that we want tracked despite the general csv ignores +!src/Backend/test_data/csv/ +!src/Backend/test_data/csv/Mental_Health_and_Social_Media_Balance_Dataset.csv +# Allow a specific parquet file that we want tracked despite the general parquet ignores +!src/Backend/test_data/parquet/ +!src/Backend/test_data/parquet/capitals_clean.parquet \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7e5fd6c..d4f60d4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,6 +19,12 @@ We use a Makefile to simplify common development tasks. All commands should be r ```bash make go-test-coverage ``` +- Run tests with HTML coverage report + ```bash + go test ./... -coverprofile=coverage.out + go tool cover -html=coverage.out + ``` + ### Rust Tests - Run all tests diff --git a/Makefile b/Makefile index 0afcce1..045191c 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,6 @@ go-test-coverage: @echo "Running Go tests with coverage..." cd src/Backend/opti-sql-go && go test -v -coverprofile=coverage.out ./... cd src/Backend/opti-sql-go && go tool cover -func=coverage.out - go-run: @echo "Running Go application..." 
cd src/Backend/opti-sql-go && go run main.go diff --git a/src/Backend/opti-sql-go/config/config.go b/src/Backend/opti-sql-go/config/config.go index d5943e6..627136b 100644 --- a/src/Backend/opti-sql-go/config/config.go +++ b/src/Backend/opti-sql-go/config/config.go @@ -16,10 +16,11 @@ var ( ) type Config struct { - Server serverConfig `yaml:"server"` - Batch batchConfig `yaml:"batch"` - Query queryConfig `yaml:"query"` - Metrics metricsConfig `yaml:"metrics"` + Server serverConfig `yaml:"server"` + Batch batchConfig `yaml:"batch"` + Query queryConfig `yaml:"query"` + Metrics metricsConfig `yaml:"metrics"` + Secretes secretesConfig // do not read these from yaml } type serverConfig struct { Port int `yaml:"port"` @@ -32,6 +33,8 @@ type batchConfig struct { EnableParallelRead bool `yaml:"enable_parallel_read"` MaxMemoryBeforeSpill uint64 `yaml:"max_memory_before_spill"` MaxFileSizeMB int `yaml:"max_file_size_mb"` // max size of a single file + ShouldDownload bool `yaml:"should_download"` + MaxDownloadSizeMB int `yaml:"max_download_size_mb"` // max size to download from external sources like S3 } type queryConfig struct { // should results be cached, server side? 
if so how long @@ -51,6 +54,12 @@ type metricsConfig struct { // memory usage over time EnableMemoryStats bool `yaml:"enable_memory_stats"` } +type secretesConfig struct { + AccessKey string `yaml:"access_key"` + SecretKey string `yaml:"secret_key"` + EndpointURL string `yaml:"endpoint_url"` + BucketName string `yaml:"bucket_name"` +} var configInstance *Config = &Config{ Server: serverConfig{ @@ -64,6 +73,10 @@ var configInstance *Config = &Config{ EnableParallelRead: true, MaxMemoryBeforeSpill: uint64(gigaByte) * 2, // 2GB MaxFileSizeMB: 500, // 500MB + // should we download files from external sources like S3? + // If so, what's the max size to download; if a file is larger than this, don't download it locally. + ShouldDownload: true, + MaxDownloadSizeMB: 10, // 10MB }, Query: queryConfig{ EnableCache: true, @@ -79,6 +92,13 @@ var configInstance *Config = &Config{ EnableQueryStats: true, EnableMemoryStats: true, }, + // SECURITY TODO: remove these hardcoded secrets before production and rotate the exposed credentials; + // load them from environment variables or a secrets manager instead. We are just testing for now. + Secretes: secretesConfig{ + AccessKey: "DO8013ZT6VDHJ2EM94RN", + SecretKey: "kPvQSMt6naiwe/FhDnzXpYmVE5yzJUsIR0/OJpsUNzo", + EndpointURL: "atl1.digitaloceanspaces.com", + BucketName: "test-bucket-pull-down", + }, } func GetConfig() *Config { @@ -138,6 +158,12 @@ func mergeConfig(dst *Config, src map[string]interface{}) { if v, ok := batch["max_file_size_mb"].(int); ok { dst.Batch.MaxFileSizeMB = v } + if v, ok := batch["should_download"].(bool); ok { + dst.Batch.ShouldDownload = v + } + if v, ok := batch["max_download_size_mb"].(int); ok { + dst.Batch.MaxDownloadSizeMB = v + } } // ============================= diff --git a/src/Backend/opti-sql-go/go.mod b/src/Backend/opti-sql-go/go.mod index 49182e3..184caaa 100644 --- a/src/Backend/opti-sql-go/go.mod +++ b/src/Backend/opti-sql-go/go.mod @@ -3,17 +3,45 @@ module opti-sql-go go 1.24.0 require ( + github.com/apache/arrow/go/v15 v15.0.2 github.com/apache/arrow/go/v17 v17.0.0 + github.com/aws/aws-sdk-go v1.55.8 + 
github.com/aws/aws-sdk-go-v2 v1.39.6 + github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2 + github.com/joho/godotenv v1.5.1 google.golang.org/grpc v1.63.2 google.golang.org/protobuf v1.34.2 gopkg.in/yaml.v3 v3.0.1 ) require ( + github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c // indirect + github.com/andybalholm/brotli v1.1.0 // indirect + github.com/apache/thrift v0.20.0 // indirect + github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 // indirect + github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13 // indirect + github.com/aws/smithy-go v1.23.2 // indirect + github.com/go-ini/ini v1.67.0 // indirect github.com/goccy/go-json v0.10.3 // indirect + github.com/golang/snappy v0.0.4 // indirect github.com/google/flatbuffers v24.3.25+incompatible // indirect + github.com/jmespath/go-jmespath v0.4.0 // indirect + github.com/klauspost/asmfmt v1.3.2 // indirect + github.com/klauspost/compress v1.17.9 // indirect github.com/klauspost/cpuid/v2 v2.2.8 // indirect + github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 // indirect + github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 // indirect + github.com/minio/minio-go v6.0.14+incompatible // indirect + github.com/mitchellh/go-homedir v1.1.0 // indirect + github.com/pierrec/lz4/v4 v4.1.21 // indirect github.com/zeebo/xxh3 v1.0.2 // indirect + golang.org/x/crypto v0.24.0 // indirect golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 // indirect golang.org/x/mod v0.18.0 // indirect golang.org/x/net v0.26.0 // indirect diff 
--git a/src/Backend/opti-sql-go/go.sum b/src/Backend/opti-sql-go/go.sum index 8839c2d..9c4220d 100644 --- a/src/Backend/opti-sql-go/go.sum +++ b/src/Backend/opti-sql-go/go.sum @@ -1,27 +1,82 @@ +github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c h1:RGWPOewvKIROun94nF7v2cua9qP+thov/7M50KEoeSU= +github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c/go.mod h1:X0CRv0ky0k6m906ixxpzmDRLvX58TFUKS2eePweuyxk= +github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= +github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= +github.com/apache/arrow/go/v15 v15.0.2 h1:60IliRbiyTWCWjERBCkO1W4Qun9svcYoZrSLcyOsMLE= +github.com/apache/arrow/go/v15 v15.0.2/go.mod h1:DGXsR3ajT524njufqf95822i+KTh+yea1jass9YXgjA= github.com/apache/arrow/go/v17 v17.0.0 h1:RRR2bdqKcdbss9Gxy2NS/hK8i4LDMh23L6BbkN5+F54= github.com/apache/arrow/go/v17 v17.0.0/go.mod h1:jR7QHkODl15PfYyjM2nU+yTLScZ/qfj7OSUZmJ8putc= +github.com/apache/thrift v0.20.0 h1:631+KvYbsBZxmuJjYwhezVsrfc/TbqtZV4QcxOX1fOI= +github.com/apache/thrift v0.20.0/go.mod h1:hOk1BQqcp2OLzGsyVXdfMk7YFlMxK3aoEVhjD06QhB8= +github.com/aws/aws-sdk-go v1.55.8 h1:JRmEUbU52aJQZ2AjX4q4Wu7t4uZjOu71uyNmaWlUkJQ= +github.com/aws/aws-sdk-go v1.55.8/go.mod h1:ZkViS9AqA6otK+JBBNH2++sx1sgxrPKcSzPPvQkUtXk= +github.com/aws/aws-sdk-go-v2 v1.39.6 h1:2JrPCVgWJm7bm83BDwY5z8ietmeJUbh3O2ACnn+Xsqk= +github.com/aws/aws-sdk-go-v2 v1.39.6/go.mod h1:c9pm7VwuW0UPxAEYGyTmyurVcNrbF6Rt/wixFqDhcjE= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3 h1:DHctwEM8P8iTXFxC/QK0MRjwEpWQeM9yzidCRjldUz0= +github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.3/go.mod h1:xdCzcZEtnSTKVDOmUZs4l/j3pSV6rpo1WXl5ugNsL8Y= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13 h1:a+8/MLcWlIxo1lF9xaGt3J/u3yOZx+CdSveSNwjhD40= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.13/go.mod h1:oGnKwIYZ4XttyU2JWxFrwvhF6YKiK/9/wmE3v3Iu9K8= 
+github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13 h1:HBSI2kDkMdWz4ZM7FjwE7e/pWDEZ+nR95x8Ztet1ooY= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.13/go.mod h1:YE94ZoDArI7awZqJzBAZ3PDD2zSfuP7w6P2knOzIn8M= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13 h1:eg/WYAa12vqTphzIdWMzqYRVKKnCboVPRlvaybNCqPA= +github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.13/go.mod h1:/FDdxWhz1486obGrKKC1HONd7krpk38LBt+dutLcN9k= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3 h1:x2Ibm/Af8Fi+BH+Hsn9TXGdT+hKbDd5XOTZxTMxDk7o= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.3/go.mod h1:IW1jwyrQgMdhisceG8fQLmQIydcT/jWY21rFhzgaKwo= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4 h1:NvMjwvv8hpGUILarKw7Z4Q0w1H9anXKsesMxtw++MA4= +github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.4/go.mod h1:455WPHSwaGj2waRSpQp7TsnpOnBfw8iDfPfbwl7KPJE= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13 h1:kDqdFvMY4AtKoACfzIGD8A0+hbT41KTKF//gq7jITfM= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.13/go.mod h1:lmKuogqSU3HzQCwZ9ZtcqOc5XGMqtDK7OIc2+DxiUEg= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13 h1:zhBJXdhWIFZ1acfDYIhu4+LCzdUS2Vbcum7D01dXlHQ= +github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.13/go.mod h1:JaaOeCE368qn2Hzi3sEzY6FgAZVCIYcC2nwbro2QCh8= +github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2 h1:DhdbtDl4FdNlj31+xiRXANxEE+eC7n8JQz+/ilwQ8Uc= +github.com/aws/aws-sdk-go-v2/service/s3 v1.90.2/go.mod h1:+wArOOrcHUevqdto9k1tKOF5++YTe9JEcPSc9Tx2ZSw= +github.com/aws/smithy-go v1.23.2 h1:Crv0eatJUQhaManss33hS5r40CG3ZFH+21XSkqMrIUM= +github.com/aws/smithy-go v1.23.2/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-ini/ini v1.67.0 h1:z6ZrTEZqSWOTyH2FlglNbNgARyHG8oLW9gMELqKr06A= +github.com/go-ini/ini v1.67.0/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8= github.com/goccy/go-json v0.10.3 h1:KZ5WoDbxAIgm2HNbYckL0se1fHD6rz5j4ywS6ebzDqA= github.com/goccy/go-json v0.10.3/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/flatbuffers v24.3.25+incompatible h1:CX395cjN9Kke9mmalRoL3d81AtFUxJM+yDthflgJGkI= github.com/google/flatbuffers v24.3.25+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= +github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= +github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= +github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= +github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= +github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4= +github.com/klauspost/asmfmt v1.3.2/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE= github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= github.com/klauspost/cpuid/v2 v2.2.8 h1:+StwCXwm9PdpiEkPyzBXIy+M9KUb4ODm0Zarf1kS5BM= github.com/klauspost/cpuid/v2 v2.2.8/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= +github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 
h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs= +github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY= +github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI= +github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE= +github.com/minio/minio-go v6.0.14+incompatible h1:fnV+GD28LeqdN6vT2XdGKW8Qe/IfjJDswNVuni6km9o= +github.com/minio/minio-go v6.0.14+incompatible/go.mod h1:7guKYtitv8dktvNUGrhzmNlA5wrAABTQXCoesZdFQO8= +github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= +github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= +golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI= +golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 h1:LfspQV/FYTatPTr/3HzIcmiUFH7PGP+OQ6mgDYo3yuQ= golang.org/x/exp 
v0.0.0-20240222234643-814bf88cf225/go.mod h1:CxmFvTBINI24O/j8iY7H1xHzx2i4OsyguNBmN/uPtqc= golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0= @@ -48,5 +103,6 @@ google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDom google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/src/Backend/opti-sql-go/main.go b/src/Backend/opti-sql-go/main.go index 82e1eb8..f277de6 100644 --- a/src/Backend/opti-sql-go/main.go +++ b/src/Backend/opti-sql-go/main.go @@ -6,6 +6,8 @@ import ( "os" ) +// TODO: in the project operators make sure the record batches account for the RowCount field properly. + func main() { if len(os.Args) > 1 { if err := config.Decode(os.Args[1]); err != nil { diff --git a/src/Backend/opti-sql-go/operators/Expr/expr.go b/src/Backend/opti-sql-go/operators/Expr/expr.go index 25c8d3c..5334beb 100644 --- a/src/Backend/opti-sql-go/operators/Expr/expr.go +++ b/src/Backend/opti-sql-go/operators/Expr/expr.go @@ -4,3 +4,21 @@ package Expr // for example Column + Literal // Column - Column // Literal / Literal + +//1. Arithmetic Expressions +// SELECT salary * 1.2, price + tax, -(discount) +//2.Alias Expressions +//SELECT name AS employee_name, age AS employee_age +//SELECT salary * 1.2 AS new_salary +//3.String Expressions +//first_name || ' ' || last_name +//UPPER(name) +//LOWER(email) +//SUBSTRING(name, 1, 3) +//4. Function calls +//ABS(x) +//ROUND(salary, 2) +//LENGTH(name) +//COALESCE(a, b) +//5. 
Constants +//SELECT 1, 'hello', 3.14 diff --git a/src/Backend/opti-sql-go/operators/filter/filter.go b/src/Backend/opti-sql-go/operators/filter/filter.go index 32326a3..4195cdd 100644 --- a/src/Backend/opti-sql-go/operators/filter/filter.go +++ b/src/Backend/opti-sql-go/operators/filter/filter.go @@ -1,7 +1,24 @@ package filter -// handle Bitwise operations here as well +import ( + "github.com/apache/arrow/go/v17/arrow" + "github.com/apache/arrow/go/v17/arrow/array" +) -// OR -// AND -// NOT +// FilterExpr takes in a field and column and yields a function that takes in an index and returns a bool indicating whether the row at that index satisfies the filter condition. +type FilterExpr func(filed arrow.Field, col arrow.Array) func(i int) bool + +// ExampleFilterExpr is an example FilterExpr: it keeps rows where the int32 "age" column is greater than 30, and keeps everything otherwise. +func ExampleFilterExpr(field arrow.Field, col arrow.Array) func(i int) bool { + { + if field.Name == "age" && col.DataType().ID() == arrow.INT32 { + return func(i int) bool { + val := col.(*array.Int32).Value(i) + return val > 30 + } + } + return func(i int) bool { + return true + } + } +} diff --git a/src/Backend/opti-sql-go/operators/project/csv.go b/src/Backend/opti-sql-go/operators/project/csv.go new file mode 100644 index 0000000..1a021c6 --- /dev/null +++ b/src/Backend/opti-sql-go/operators/project/csv.go @@ -0,0 +1,235 @@ +package project + +import ( + "encoding/csv" + "fmt" + "io" + "opti-sql-go/operators" + "strconv" + "strings" + + "github.com/apache/arrow/go/v15/arrow/memory" + "github.com/apache/arrow/go/v17/arrow" + "github.com/apache/arrow/go/v17/arrow/array" +) + +var ( + _ = (operators.Operator)(&CSVSource{}) +) + +type CSVSource struct { + r *csv.Reader + schema *arrow.Schema // columns to project as well as types to cast to + colPosition map[string]int + firstDataRow []string + done bool // if this is set in Next, we have reached EOF +} + +// assume everything is on disk for now +func NewProjectCSVLeaf(source io.Reader) (*CSVSource, error) { + r := csv.NewReader(source) + proj := &CSVSource{ + 
r: r, + colPosition: make(map[string]int), + } + var err error + // construct the schema from the header + proj.schema, err = proj.parseHeader() + return proj, err +} + +func (csvS *CSVSource) Next(n uint16) (*operators.RecordBatch, error) { + if csvS.done { + return nil, io.EOF + } + + // 1. Create builders + builders := csvS.initBuilders() + + rowsRead := uint16(0) + + // Process stored first row (from parseHeader) --- + if csvS.firstDataRow != nil && rowsRead < n { + if err := csvS.processRow(csvS.firstDataRow, builders); err != nil { + return nil, err + } + csvS.firstDataRow = nil // consume it once + rowsRead++ + } + + // Stream remaining rows from CSV reader --- + for rowsRead < n { + row, err := csvS.r.Read() + if err == io.EOF { + if rowsRead == 0 { + csvS.done = true + return nil, io.EOF + } + break + } + if err != nil { + return nil, err + } + + // append to builders + if err := csvS.processRow(row, builders); err != nil { + return nil, err + } + + rowsRead++ + } + + // Freeze into Arrow arrays + columns := csvS.finalizeBuilders(builders) + + return &operators.RecordBatch{ + Schema: csvS.schema, + Columns: columns, + RowCount: uint64(rowsRead), + }, nil +} +func (csvS *CSVSource) Close() error { + csvS.r = nil + csvS.done = true + return nil +} + +func (csvS *CSVSource) Schema() *arrow.Schema { + return csvS.schema +} +func (csvS *CSVSource) initBuilders() []array.Builder { + fields := csvS.schema.Fields() + builders := make([]array.Builder, len(fields)) + + for i, f := range fields { + builders[i] = array.NewBuilder(memory.DefaultAllocator, f.Type) + } + + return builders +} +func (csvS *CSVSource) processRow( + content []string, + builders []array.Builder, +) error { + fields := csvS.schema.Fields() + for i, f := range fields { + colIdx := csvS.colPosition[f.Name] + cell := content[colIdx] + + switch b := builders[i].(type) { + + case *array.Int64Builder: + if cell == "" || cell == "NULL" { + b.AppendNull() + } else { + v, err := strconv.ParseInt(cell, 
10, 64) + if err != nil { + fmt.Printf("failed to parse cell: %v with error: %v\n", cell, err) + b.AppendNull() + } else { + b.Append(v) + } + } + + case *array.Float64Builder: + if cell == "" || cell == "NULL" { + b.AppendNull() + } else { + v, err := strconv.ParseFloat(cell, 64) + if err != nil { + fmt.Printf("failed to parse cell: %v with error: %v\n", cell, err) + b.AppendNull() + } else { + b.Append(v) + } + } + + case *array.StringBuilder: + if cell == "" || cell == "NULL" { + b.AppendNull() + } else { + b.Append(cell) + } + + case *array.BooleanBuilder: + if cell == "" || cell == "NULL" { + b.AppendNull() + } else { + b.Append(cell == "true") + } + + default: + return fmt.Errorf("unsupported Arrow type: %s", f.Type) + } + } + + return nil +} +func (csvS *CSVSource) finalizeBuilders(builders []array.Builder) []arrow.Array { + columns := make([]arrow.Array, len(builders)) + + for i, b := range builders { + columns[i] = b.NewArray() + b.Release() + } + + return columns +} + +// first call to csv.Reader +func (csvS *CSVSource) parseHeader() (*arrow.Schema, error) { + header, err := csvS.r.Read() + if err != nil { + return nil, err + } + firstDataRow, err := csvS.r.Read() + if err != nil { + return nil, err + } + csvS.firstDataRow = firstDataRow + newFields := make([]arrow.Field, 0, len(header)) + for i, colName := range header { + sampleValue := firstDataRow[i] + newFields = append(newFields, arrow.Field{ + Name: colName, + Type: parseDataType(sampleValue), + Nullable: true, + }) + csvS.colPosition[colName] = i + } + return arrow.NewSchema(newFields, nil), nil +} +func parseDataType(sample string) arrow.DataType { + sample = strings.TrimSpace(sample) + + // Nulls or empty fields → treat as nullable string in inference + if sample == "" || strings.EqualFold(sample, "NULL") { + return arrow.BinaryTypes.String + } + + // Boolean + if sample == "true" || sample == "false" { + return arrow.FixedWidthTypes.Boolean + } + + // Try int + if _, err := 
strconv.Atoi(sample); err == nil { + return arrow.PrimitiveTypes.Int64 + } + + // Try float + if _, err := strconv.ParseFloat(sample, 64); err == nil { + return arrow.PrimitiveTypes.Float64 + } + + // Fallback to string + return arrow.BinaryTypes.String +} + +/* +Integers (int8, int16, int32, int64) - whole numbers like 42, -100 +Floating point (float32, float64) - decimal numbers like 3.14, -0.5 +Booleans - true/false values (often represented as "true"/"false", "1"/"0", or "yes"/"no") +Strings (text) - any text like "hello", "John Doe" +Nulls +*/ diff --git a/src/Backend/opti-sql-go/operators/project/csv_test.go b/src/Backend/opti-sql-go/operators/project/csv_test.go new file mode 100644 index 0000000..8a119b5 --- /dev/null +++ b/src/Backend/opti-sql-go/operators/project/csv_test.go @@ -0,0 +1,957 @@ +package project + +import ( + "io" + "os" + "strings" + "testing" + + "github.com/apache/arrow/go/v17/arrow" + "github.com/apache/arrow/go/v17/arrow/array" + "github.com/apache/arrow/go/v17/arrow/memory" +) + +const csvFilePath = "../../../test_data/csv/Mental_Health_and_Social_Media_Balance_Dataset.csv" + +func getTestFile() *os.File { + v, err := os.Open(csvFilePath) + if err != nil { + panic(err) + } + return v +} + +func TestCsvInit(t *testing.T) { + v := getTestFile() + defer func() { + if err := v.Close(); err != nil { + t.Fatalf("failed to close: %v", err) + } + }() + p, err := NewProjectCSVLeaf(v) + if err != nil { + t.Errorf("Failed to create ProjectCSVLeaf: %v", err) + } + t.Logf("schema -> %v\n", p.schema) + t.Logf("columns Mapping -> %v\n", p.colPosition) +} +func TestProjectComponents(t *testing.T) { + v := getTestFile() + defer func() { + if err := v.Close(); err != nil { + t.Fatalf("failed to close: %v", err) + } + }() + p, err := NewProjectCSVLeaf(v) + if err != nil { + t.Errorf("Failed to create ProjectCSVLeaf: %v", err) + } + if p.schema == nil { + t.Errorf("Schema is nil") + } + if len(p.colPosition) == 0 { + t.Errorf("Column position mapping is 
empty") + } +} +func TestCsvNext(t *testing.T) { + v := getTestFile() + defer func() { + if err := v.Close(); err != nil { + t.Fatalf("failed to close: %v", err) + } + }() + + csvLeaf, err := NewProjectCSVLeaf(v) + if err != nil { + t.Errorf("Failed to create ProjectCSVLeaf: %v", err) + } + rBatch, err := csvLeaf.Next(10) + if err != nil { + t.Errorf("Failed to read next batch from CSV: %v", err) + } + t.Logf("Batch: %v\n", rBatch) +} + +// TestParseDataType tests every branch of the parseDataType function +func TestParseDataType(t *testing.T) { + tests := []struct { + name string + input string + expected arrow.DataType + }{ + // Empty and NULL cases + { + name: "Empty string", + input: "", + expected: arrow.BinaryTypes.String, + }, + { + name: "NULL uppercase", + input: "NULL", + expected: arrow.BinaryTypes.String, + }, + { + name: "NULL lowercase", + input: "null", + expected: arrow.BinaryTypes.String, + }, + { + name: "NULL mixed case", + input: "NuLl", + expected: arrow.BinaryTypes.String, + }, + { + name: "Empty string with whitespace", + input: " ", + expected: arrow.BinaryTypes.String, + }, + { + name: "NULL with whitespace", + input: " NULL ", + expected: arrow.BinaryTypes.String, + }, + + // Boolean cases + { + name: "Boolean true", + input: "true", + expected: arrow.FixedWidthTypes.Boolean, + }, + { + name: "Boolean false", + input: "false", + expected: arrow.FixedWidthTypes.Boolean, + }, + { + name: "Boolean true with whitespace", + input: " true ", + expected: arrow.FixedWidthTypes.Boolean, + }, + { + name: "Boolean false with whitespace", + input: " false ", + expected: arrow.FixedWidthTypes.Boolean, + }, + + // Integer cases + { + name: "Positive integer", + input: "123", + expected: arrow.PrimitiveTypes.Int64, + }, + { + name: "Negative integer", + input: "-456", + expected: arrow.PrimitiveTypes.Int64, + }, + { + name: "Zero", + input: "0", + expected: arrow.PrimitiveTypes.Int64, + }, + { + name: "Integer with whitespace", + input: " 789 ", + 
expected: arrow.PrimitiveTypes.Int64, + }, + + // Float cases + { + name: "Positive float", + input: "3.14", + expected: arrow.PrimitiveTypes.Float64, + }, + { + name: "Negative float", + input: "-2.71", + expected: arrow.PrimitiveTypes.Float64, + }, + { + name: "Float with leading zero", + input: "0.5", + expected: arrow.PrimitiveTypes.Float64, + }, + { + name: "Float with trailing zero", + input: "1.0", + expected: arrow.PrimitiveTypes.Float64, + }, + { + name: "Float with whitespace", + input: " 9.99 ", + expected: arrow.PrimitiveTypes.Float64, + }, + { + name: "Scientific notation", + input: "1.23e10", + expected: arrow.PrimitiveTypes.Float64, + }, + + // String fallback cases + { + name: "Regular string", + input: "hello", + expected: arrow.BinaryTypes.String, + }, + { + name: "String with spaces", + input: "hello world", + expected: arrow.BinaryTypes.String, + }, + { + name: "String with numbers", + input: "abc123", + expected: arrow.BinaryTypes.String, + }, + { + name: "Boolean-like but not exact", + input: "True", + expected: arrow.BinaryTypes.String, + }, + { + name: "Boolean-like but not exact 2", + input: "FALSE", + expected: arrow.BinaryTypes.String, + }, + { + name: "Invalid number", + input: "12.34.56", + expected: arrow.BinaryTypes.String, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := parseDataType(tt.input) + if result != tt.expected { + t.Errorf("parseDataType(%q) = %v, expected %v", tt.input, result, tt.expected) + } + }) + } +} + +// TestParseHeader tests the parseHeader function +func TestParseHeader(t *testing.T) { + t.Run("Valid header with all data types", func(t *testing.T) { + csvData := `id,name,age,salary,active +123,John,30,50000.50,true` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + // Check schema was created + if proj.schema == nil { + t.Fatal("Schema is nil") + } + + // Check correct 
number of fields + fields := proj.schema.Fields() + if len(fields) != 5 { + t.Errorf("Expected 5 fields, got %d", len(fields)) + } + + // Check field names and types + expectedFields := map[string]arrow.DataType{ + "id": arrow.PrimitiveTypes.Int64, + "name": arrow.BinaryTypes.String, + "age": arrow.PrimitiveTypes.Int64, + "salary": arrow.PrimitiveTypes.Float64, + "active": arrow.FixedWidthTypes.Boolean, + } + + for _, field := range fields { + expectedType, exists := expectedFields[field.Name] + if !exists { + t.Errorf("Unexpected field name: %s", field.Name) + continue + } + if field.Type != expectedType { + t.Errorf("Field %s: expected type %v, got %v", field.Name, expectedType, field.Type) + } + if !field.Nullable { + t.Errorf("Field %s: expected nullable=true, got false", field.Name) + } + } + + // Check column position mapping + if len(proj.colPosition) != 5 { + t.Errorf("Expected 5 column positions, got %d", len(proj.colPosition)) + } + + expectedPositions := map[string]int{ + "id": 0, + "name": 1, + "age": 2, + "salary": 3, + "active": 4, + } + + for name, expectedPos := range expectedPositions { + actualPos, exists := proj.colPosition[name] + if !exists { + t.Errorf("Column position for %s not found", name) + continue + } + if actualPos != expectedPos { + t.Errorf("Column %s: expected position %d, got %d", name, expectedPos, actualPos) + } + } + }) + + t.Run("Header with NULL values", func(t *testing.T) { + csvData := `col1,col2,col3 +NULL,,value` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + fields := proj.schema.Fields() + // All should be inferred as string + for _, field := range fields { + if field.Type != arrow.BinaryTypes.String { + t.Errorf("Field %s: expected String type for NULL/empty value, got %v", field.Name, field.Type) + } + } + }) + + t.Run("Empty file - header only", func(t *testing.T) { + csvData := `col1,col2` + reader := 
strings.NewReader(csvData) + _, err := NewProjectCSVLeaf(reader) + if err == nil { + t.Error("Expected error for CSV with header but no data rows") + } + }) + + t.Run("Completely empty file", func(t *testing.T) { + csvData := `` + reader := strings.NewReader(csvData) + _, err := NewProjectCSVLeaf(reader) + if err == nil { + t.Error("Expected error for completely empty CSV") + } + }) +} + +// TestNewProjectCSVLeaf tests the constructor +func TestNewProjectCSVLeaf(t *testing.T) { + t.Run("Valid CSV initialization", func(t *testing.T) { + csvData := `name,value +test,123` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + if proj == nil { + t.Fatal("ProjectCSVLeaf is nil") + } + if proj.r == nil { + t.Error("CSV reader is nil") + } + if proj.schema == nil { + t.Error("Schema is nil") + } + if proj.colPosition == nil { + t.Error("Column position map is nil") + } + if proj.done { + t.Error("done flag should be false initially") + } + }) + + t.Run("Error during header parsing", func(t *testing.T) { + csvData := `only_header` + reader := strings.NewReader(csvData) + _, err := NewProjectCSVLeaf(reader) + if err == nil { + t.Error("Expected error when no data rows present") + } + }) +} + +// TestNextFunction tests the Next function comprehensively +func TestNextFunction(t *testing.T) { + t.Run("Read single batch with all data types", func(t *testing.T) { + csvData := `id,name,score,active +1,Alice,95.5,true +2,Bob,87.3,false +3,Charlie,92.1,true` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + batch, err := proj.Next(10) + if err != nil { + t.Fatalf("Next failed: %v", err) + } + + if batch == nil { + t.Fatal("Batch is nil") + } + + // Check schema + if batch.Schema == nil { + t.Fatal("Batch schema is nil") + } + + // Check columns + if len(batch.Columns) != 4 { + 
t.Fatalf("Expected 4 columns, got %d", len(batch.Columns)) + } + + // Verify each column has 3 rows + for i, col := range batch.Columns { + if col.Len() != 3 { + t.Errorf("Column %d: expected 3 rows, got %d", i, col.Len()) + } + } + t.Logf("col0: %v\n", batch.Columns[0]) + // Check Int64 column (id) + idCol, ok := batch.Columns[0].(*array.Int64) + if !ok { + t.Errorf("Column 0 (id): expected *array.Int64, got %T", batch.Columns[0]) + } else { + if idCol.Value(0) != 1 || idCol.Value(1) != 2 || idCol.Value(2) != 3 { + t.Errorf("ID column values incorrect: got [%d, %d, %d]", idCol.Value(0), idCol.Value(1), idCol.Value(2)) + } + } + + // Check String column (name) + nameCol, ok := batch.Columns[1].(*array.String) + if !ok { + t.Errorf("Column 1 (name): expected *array.String, got %T", batch.Columns[1]) + } else { + if nameCol.Value(0) != "Alice" || nameCol.Value(1) != "Bob" || nameCol.Value(2) != "Charlie" { + t.Errorf("Name column values incorrect") + } + } + + // Check Float64 column (score) + scoreCol, ok := batch.Columns[2].(*array.Float64) + if !ok { + t.Errorf("Column 2 (score): expected *array.Float64, got %T", batch.Columns[2]) + } else { + if scoreCol.Value(0) != 95.5 || scoreCol.Value(1) != 87.3 || scoreCol.Value(2) != 92.1 { + t.Errorf("Score column values incorrect") + } + } + + // Check Boolean column (active) + activeCol, ok := batch.Columns[3].(*array.Boolean) + if !ok { + t.Errorf("Column 3 (active): expected *array.Boolean, got %T", batch.Columns[3]) + } else { + if !activeCol.Value(0) || activeCol.Value(1) || !activeCol.Value(2) { + t.Errorf("Active column values incorrect") + } + } + }) + + t.Run("Read with NULL values - Int64", func(t *testing.T) { + csvData := `id,value +1,100 +,200 +3,` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + batch, err := proj.Next(10) + if err != nil { + t.Fatalf("Next failed: %v", err) + } + + // Check id column for 
NULLs + idCol, ok := batch.Columns[0].(*array.Int64) + if !ok { + t.Fatalf("Column 0: expected *array.Int64, got %T", batch.Columns[0]) + } + + if !idCol.IsNull(1) { + t.Error("Expected NULL at index 1 in id column") + } + if idCol.IsNull(0) || idCol.IsNull(2) { + t.Error("Unexpected NULL in id column") + } + + // Check value column for NULLs + valueCol, ok := batch.Columns[1].(*array.Int64) + if !ok { + t.Fatalf("Column 1: expected *array.Int64, got %T", batch.Columns[1]) + } + + if !valueCol.IsNull(2) { + t.Error("Expected NULL at index 2 in value column") + } + }) + + t.Run("Read with NULL values - Float64", func(t *testing.T) { + csvData := `price +99.99 +NULL +` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + batch, err := proj.Next(10) + if err != nil { + t.Fatalf("Next failed: %v", err) + } + + priceCol, ok := batch.Columns[0].(*array.Float64) + if !ok { + t.Fatalf("Expected *array.Float64, got %T", batch.Columns[0]) + } + + if !priceCol.IsNull(1) || !priceCol.IsNull(2) { + t.Error("Expected NULL values in price column") + } + }) + + t.Run("Read with NULL values - String", func(t *testing.T) { + csvData := `name +Alice +NULL +` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + batch, err := proj.Next(10) + if err != nil { + t.Fatalf("Next failed: %v", err) + } + + nameCol, ok := batch.Columns[0].(*array.String) + if !ok { + t.Fatalf("Expected *array.String, got %T", batch.Columns[0]) + } + + if !nameCol.IsNull(1) || !nameCol.IsNull(2) { + t.Error("Expected NULL values in name column") + } + }) + + t.Run("Read with NULL values - Boolean", func(t *testing.T) { + csvData := `flag +true +NULL +false +` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + 
batch, err := proj.Next(10) + if err != nil { + t.Fatalf("Next failed: %v", err) + } + + flagCol, ok := batch.Columns[0].(*array.Boolean) + if !ok { + t.Fatalf("Expected *array.Boolean, got %T", batch.Columns[0]) + } + t.Logf("flagCol : %v\n", flagCol) + + if !flagCol.IsNull(1) { + t.Error("Expected NULL values in flag column") + } + }) + + t.Run("Read multiple batches", func(t *testing.T) { + csvData := `id +1 +2 +3 +4 +5 +6` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + // First batch of 2 + batch1, err := proj.Next(2) + if err != nil { + t.Fatalf("First Next failed: %v", err) + } + if batch1.Columns[0].Len() != 2 { + t.Errorf("First batch: expected 2 rows, got %d", batch1.Columns[0].Len()) + } + + // Second batch of 3 + batch2, err := proj.Next(3) + if err != nil { + t.Fatalf("Second Next failed: %v", err) + } + if batch2.Columns[0].Len() != 3 { + t.Errorf("Second batch: expected 3 rows, got %d", batch2.Columns[0].Len()) + } + + // Third batch - should get remaining 1 row + batch3, err := proj.Next(10) + if err != nil { + t.Fatalf("Third Next failed: %v", err) + } + if batch3.Columns[0].Len() != 1 { + t.Errorf("Third batch: expected 1 row, got %d", batch3.Columns[0].Len()) + } + + // Fourth batch - should return EOF + _, err = proj.Next(10) + if err != io.EOF { + t.Errorf("Expected EOF, got: %v", err) + } + }) + + t.Run("Read exact batch size", func(t *testing.T) { + csvData := `num +10 +20 +30` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + batch, err := proj.Next(3) + if err != nil { + t.Fatalf("Next failed: %v", err) + } + if batch.Columns[0].Len() != 3 { + t.Errorf("Expected 3 rows, got %d", batch.Columns[0].Len()) + } + + // Next call should return EOF + _, err = proj.Next(1) + if err != io.EOF { + t.Errorf("Expected EOF after reading all data, 
got: %v", err) + } + }) + + t.Run("EOF on first Next call - empty data", func(t *testing.T) { + csvData := `col1 +val1` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + // Read the only row + _, err = proj.Next(10) + if err != nil { + t.Fatalf("First Next failed: %v", err) + } + + // Second call when no data remains and rowsRead == 0 + _, err = proj.Next(10) + if err != io.EOF { + t.Errorf("Expected EOF when no data left, got: %v", err) + } + + // Verify done flag is set + if !proj.done { + t.Error("Expected done flag to be true after EOF") + } + }) + + t.Run("Subsequent calls after done is set", func(t *testing.T) { + csvData := `val +1` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + // Read all data + _, _ = proj.Next(10) + + // Hit EOF and set done + _, err = proj.Next(10) + if err != io.EOF { + t.Fatalf("Expected EOF, got: %v", err) + } + + // Call again - should immediately return EOF due to done flag + _, err = proj.Next(10) + if err != io.EOF { + t.Errorf("Expected EOF on subsequent call when done=true, got: %v", err) + } + }) + + t.Run("Batch size of 1", func(t *testing.T) { + csvData := `x +a +b +c` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + // Read one row at a time + for i := 0; i < 3; i++ { + batch, err := proj.Next(1) + if err != nil { + t.Fatalf("Next call %d failed: %v", i+1, err) + } + if batch.Columns[0].Len() != 1 { + t.Errorf("Batch %d: expected 1 row, got %d", i+1, batch.Columns[0].Len()) + } + } + }) + + t.Run("Large batch size with fewer rows", func(t *testing.T) { + csvData := `num +1 +2` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: 
%v", err) + } + + batch, err := proj.Next(1000) + if err != nil { + t.Fatalf("Next failed: %v", err) + } + if batch.Columns[0].Len() != 2 { + t.Errorf("Expected 2 rows, got %d", batch.Columns[0].Len()) + } + }) + + t.Run("EOF mid-batch breaks correctly", func(t *testing.T) { + csvData := `id +1 +2 +3` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + // Request 10 rows, but only 3 exist + batch, err := proj.Next(10) + if err != nil { + t.Fatalf("Next failed: %v", err) + } + + // Should get 3 rows (break on EOF, not error) + if batch.Columns[0].Len() != 3 { + t.Errorf("Expected 3 rows when hitting EOF mid-batch, got %d", batch.Columns[0].Len()) + } + }) + + t.Run("Boolean false value handling", func(t *testing.T) { + csvData := `active +false +true +false` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + batch, err := proj.Next(10) + if err != nil { + t.Fatalf("Next failed: %v", err) + } + + boolCol, ok := batch.Columns[0].(*array.Boolean) + if !ok { + t.Fatalf("Expected *array.Boolean, got %T", batch.Columns[0]) + } + + // Verify false values are correctly stored + if boolCol.Value(0) != false { + t.Error("Expected false at index 0") + } + if boolCol.Value(1) != true { + t.Error("Expected true at index 1") + } + if boolCol.Value(2) != false { + t.Error("Expected false at index 2") + } + }) + + t.Run("Column ordering matches schema", func(t *testing.T) { + csvData := `z,y,x +1,2,3 +4,5,6` + reader := strings.NewReader(csvData) + proj, err := NewProjectCSVLeaf(reader) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + batch, err := proj.Next(10) + if err != nil { + t.Fatalf("Next failed: %v", err) + } + + // Verify schema field order + fields := batch.Schema.Fields() + if fields[0].Name != "z" || fields[1].Name != "y" || fields[2].Name != 
"x" { + t.Error("Schema field order doesn't match CSV header order") + } + + // Verify data is in correct columns + zCol := batch.Columns[0].(*array.Int64) + yCol := batch.Columns[1].(*array.Int64) + xCol := batch.Columns[2].(*array.Int64) + + if zCol.Value(0) != 1 || yCol.Value(0) != 2 || xCol.Value(0) != 3 { + t.Error("First row data not in correct column order") + } + if zCol.Value(1) != 4 || yCol.Value(1) != 5 || xCol.Value(1) != 6 { + t.Error("Second row data not in correct column order") + } + }) +} + +// TestIntegrationWithRealFile tests with the actual test file +func TestIntegrationWithRealFile(t *testing.T) { + t.Run("Real file - multiple batches", func(t *testing.T) { + v := getTestFile() + defer func() { + if err := v.Close(); err != nil { + t.Fatalf("failed to close: %v", err) + } + }() + + proj, err := NewProjectCSVLeaf(v) + if err != nil { + t.Fatalf("NewProjectCSVLeaf failed: %v", err) + } + + totalRows := 0 + batchCount := 0 + + for { + batch, err := proj.Next(10) + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("Next failed on batch %d: %v", batchCount+1, err) + } + + batchCount++ + if len(batch.Columns) > 0 { + totalRows += batch.Columns[0].Len() + } + + // Verify all columns have same length + expectedLen := batch.Columns[0].Len() + for i, col := range batch.Columns { + if col.Len() != expectedLen { + t.Errorf("Batch %d, Column %d: length mismatch, expected %d, got %d", + batchCount, i, expectedLen, col.Len()) + } + } + } + + if batchCount == 0 { + t.Error("Expected at least one batch from real file") + } + if totalRows == 0 { + t.Error("Expected at least one row from real file") + } + + t.Logf("Read %d batches with total of %d rows", batchCount, totalRows) + }) + + t.Run("Real file - schema validation", func(t *testing.T) { + v := getTestFile() + defer func() { + if err := v.Close(); err != nil { + t.Fatalf("failed to close: %v", err) + } + }() + + proj, err := NewProjectCSVLeaf(v) + if err != nil { + t.Fatalf("NewProjectCSVLeaf 
failed: %v", err)
+		}
+
+		if proj.schema == nil {
+			t.Fatal("Schema is nil")
+		}
+
+		fields := proj.schema.Fields()
+		if len(fields) == 0 {
+			t.Error("Schema has no fields")
+		}
+
+		// Verify all fields are nullable
+		for _, field := range fields {
+			if !field.Nullable {
+				t.Errorf("Field %s is not nullable", field.Name)
+			}
+		}
+
+		// Verify colPosition map matches schema
+		if len(proj.colPosition) != len(fields) {
+			t.Errorf("Column position map size (%d) doesn't match schema field count (%d)",
+				len(proj.colPosition), len(fields))
+		}
+
+		for i, field := range fields {
+			pos, exists := proj.colPosition[field.Name]
+			if !exists {
+				t.Errorf("Field %s not found in column position map", field.Name)
+			}
+			if pos != i {
+				t.Errorf("Field %s: expected position %d, got %d", field.Name, i, pos)
+			}
+		}
+	})
+}
+
+// TestProcessFirstLine verifies that processRow reports an error when the row
+// values cannot be appended to builders of a mismatched type (all builders
+// here are Date64, while the row carries int/string/float/bool text).
+func TestProcessFirstLine(t *testing.T) {
+	v := getTestFile()
+	p, err := NewProjectCSVLeaf(v)
+	// Fatalf, not Errorf: on constructor failure p is nil and the code below
+	// would panic dereferencing p.schema.
+	if err != nil {
+		t.Fatalf("Failed to create ProjectCSVLeaf: %v", err)
+	}
+	defer func() {
+		if err := v.Close(); err != nil {
+			t.Fatalf("failed to close: %v", err)
+		}
+	}()
+	var builders []array.Builder
+	for range len(p.schema.Fields()) {
+		// deliberately wrong builder type so processRow must fail
+		builder := array.NewBuilder(memory.DefaultAllocator, &arrow.Date64Type{})
+		defer builder.Release()
+		builders = append(builders, builder)
+	}
+	err = p.processRow([]string{"1", "alice", "95.5", "true"}, builders)
+	if err == nil {
+		t.Errorf("Expected error for mismatched builder types, got nil")
+	}
+}
diff --git a/src/Backend/opti-sql-go/operators/project/custom.go b/src/Backend/opti-sql-go/operators/project/custom.go
new file mode 100644
index 0000000..38f2a94
--- /dev/null
+++ b/src/Backend/opti-sql-go/operators/project/custom.go
@@ -0,0 +1,236 @@
+package project
+
+import (
+	"fmt"
+	"io"
+	"opti-sql-go/operators"
+
+	// NOTE(review): the original imported memory from arrow v15 while arrow and
+	// array come from v17. v15's memory.Allocator is a distinct type that does
+	// not satisfy the v17 builder constructors, so the file could not compile.
+	// All three packages must come from the same major version.
+	"github.com/apache/arrow/go/v17/arrow"
+	"github.com/apache/arrow/go/v17/arrow/array"
+	"github.com/apache/arrow/go/v17/arrow/memory"
+)
+
+// Compile-time check that InMemorySource implements operators.Operator
+// (typed nil avoids the allocation of a composite literal).
+var (
+	_ = (operators.Operator)((*InMemorySource)(nil))
+)
+
+// in memory
format just for the ease of testing
+// same as other sources, we can use structs/slices here
+
+// thankfully we already covered most of this in record.go
+// add a couple utility functions for go types and this should be good to go
+var (
+	// ErrInvalidInMemoryDataType reports a Go slice type that cannot be
+	// converted to an arrow column.
+	ErrInvalidInMemoryDataType = func(Type any) error {
+		return fmt.Errorf("%T is not a supported in memory dataType for InMemoryProjectExec", Type)
+	}
+)
+
+// InMemorySource is an Operator backed by in-memory arrow arrays.
+// The uint16 cursor limits a source to 65535 rows, which is fine for its
+// intended use in tests.
+type InMemorySource struct {
+	schema        *arrow.Schema
+	columns       []arrow.Array
+	pos           uint16 // read cursor into every column
+	fieldToColIDx map[string]int
+}
+
+// NewInMemoryProjectExec builds an InMemorySource from parallel slices of
+// column names and Go-typed columns (see supportedType for accepted types).
+// All columns must contain the same number of rows: Next uses column 0 as the
+// EOF sentinel and slices every column by the same cursor, so mismatched
+// lengths previously produced ragged batches or a uint16 underflow followed by
+// an out-of-range slice panic. They are now rejected up front.
+func NewInMemoryProjectExec(names []string, columns []any) (*InMemorySource, error) {
+	if len(names) != len(columns) {
+		return nil, operators.ErrInvalidSchema("number of column names and columns do not match")
+	}
+	fields := make([]arrow.Field, 0, len(names))
+	arrays := make([]arrow.Array, 0, len(names))
+	fieldToColIDx := make(map[string]int, len(names))
+	// parse schema from each of the columns
+	for i, col := range columns {
+		if !supportedType(col) {
+			return nil, operators.ErrInvalidSchema(fmt.Sprintf("unsupported column type for column %s", names[i]))
+		}
+		field, arr, err := unpackColumn(names[i], col)
+		if err != nil {
+			return nil, ErrInvalidInMemoryDataType(col)
+		}
+		// every column must agree on the row count (see doc comment)
+		if i > 0 && arr.Len() != arrays[0].Len() {
+			return nil, operators.ErrInvalidSchema(fmt.Sprintf(
+				"column %s has %d rows, expected %d", names[i], arr.Len(), arrays[0].Len()))
+		}
+		fields = append(fields, field)
+		arrays = append(arrays, arr)
+		fieldToColIDx[field.Name] = i
+	}
+	return &InMemorySource{
+		schema:        arrow.NewSchema(fields, nil),
+		columns:       arrays,
+		fieldToColIDx: fieldToColIDx,
+	}, nil
+}
+
+// withFields projects the source down to only the named fields, rebuilding the
+// schema, the column slice and the name-to-index map.
+func (ms *InMemorySource) withFields(names ...string) error {
+	newSchema, cols, err := ProjectSchemaFilterDown(ms.schema, ms.columns, names...)
+	if err != nil {
+		return err
+	}
+	newMap := make(map[string]int, newSchema.NumFields())
+	for i, f := range newSchema.Fields() {
+		newMap[f.Name] = i
+	}
+	ms.schema = newSchema
+	ms.fieldToColIDx = newMap
+	ms.columns = cols
+	return nil
+}
+
+// Next returns the next batch of up to n rows, or io.EOF once exhausted.
+func (ms *InMemorySource) Next(n uint16) (*operators.RecordBatch, error) {
+	if len(ms.columns) == 0 || ms.pos >= uint16(ms.columns[0].Len()) {
+		return nil, io.EOF // EOF
+	}
+	// All columns share one length (enforced by the constructor), so the batch
+	// size is computed once instead of per column.
+	remaining := uint16(ms.columns[0].Len()) - ms.pos
+	toRead := n
+	if remaining < n {
+		toRead = remaining
+	}
+	outPutCols := make([]arrow.Array, len(ms.schema.Fields()))
+	for i, field := range ms.schema.Fields() {
+		col := ms.columns[ms.fieldToColIDx[field.Name]]
+		// zero-copy view over [pos, pos+toRead)
+		outPutCols[i] = array.NewSlice(col, int64(ms.pos), int64(ms.pos+toRead))
+	}
+	ms.pos += toRead
+
+	return &operators.RecordBatch{
+		Schema:   ms.schema,
+		Columns:  outPutCols,
+		RowCount: uint64(toRead),
+	}, nil
+}
+
+// Close releases every held column array.
+func (ms *InMemorySource) Close() error {
+	for _, c := range ms.columns {
+		c.Release()
+	}
+	return nil
+}
+
+// Schema returns the current (possibly projected) arrow schema.
+func (ms *InMemorySource) Schema() *arrow.Schema {
+	return ms.schema
+}
+
+// unpackColumn converts a single Go slice into an (arrow.Field, arrow.Array)
+// pair. []int widens to Int64 and []uint to Uint64; every field is marked
+// nullable. Returns an error for any slice type not listed in supportedType.
+func unpackColumn(name string, col any) (arrow.Field, arrow.Array, error) {
+	// need to not only build the array; but also need the schema
+	var field arrow.Field
+	field.Name = name
+	field.Nullable = true // default to nullable
+	switch colType := col.(type) {
+	case []int:
+		field.Type = arrow.PrimitiveTypes.Int64
+		b := array.NewInt64Builder(memory.DefaultAllocator)
+		defer b.Release()
+		// platform-sized int is widened element by element
+		for _, v := range colType {
+			b.Append(int64(v))
+		}
+		return field, b.NewArray(), nil
+	case []int8:
+		field.Type = arrow.PrimitiveTypes.Int8
+		b := array.NewInt8Builder(memory.DefaultAllocator)
+		defer b.Release()
+		b.AppendValues(colType, nil)
+		return field, b.NewArray(), nil
+	case []int16:
+		field.Type = arrow.PrimitiveTypes.Int16
+		b := array.NewInt16Builder(memory.DefaultAllocator)
+		defer b.Release()
+		b.AppendValues(colType, nil)
+		return field, b.NewArray(), nil
+	case []int32:
+		field.Type = arrow.PrimitiveTypes.Int32
+		b := array.NewInt32Builder(memory.DefaultAllocator)
+		defer b.Release()
+		b.AppendValues(colType, nil)
+		return field, b.NewArray(), nil
+	case []int64:
+		field.Type = arrow.PrimitiveTypes.Int64
+		b := array.NewInt64Builder(memory.DefaultAllocator)
+		defer b.Release()
+		b.AppendValues(colType, nil)
+		return field, b.NewArray(), nil
+	case []uint:
+		field.Type = arrow.PrimitiveTypes.Uint64
+		b := array.NewUint64Builder(memory.DefaultAllocator)
+		defer b.Release()
+		// platform-sized uint is widened element by element
+		for _, v := range colType {
+			b.Append(uint64(v))
+		}
+		return field, b.NewArray(), nil
+	case []uint8:
+		field.Type = arrow.PrimitiveTypes.Uint8
+		b := array.NewUint8Builder(memory.DefaultAllocator)
+		defer b.Release()
+		b.AppendValues(colType, nil)
+		return field, b.NewArray(), nil
+	case []uint16:
+		field.Type = arrow.PrimitiveTypes.Uint16
+		b := array.NewUint16Builder(memory.DefaultAllocator)
+		defer b.Release()
+		b.AppendValues(colType, nil)
+		return field, b.NewArray(), nil
+	case []uint32:
+		field.Type = arrow.PrimitiveTypes.Uint32
+		b := array.NewUint32Builder(memory.DefaultAllocator)
+		defer b.Release()
+		b.AppendValues(colType, nil)
+		return field, b.NewArray(), nil
+	case []uint64:
+		field.Type = arrow.PrimitiveTypes.Uint64
+		b := array.NewUint64Builder(memory.DefaultAllocator)
+		defer b.Release()
+		b.AppendValues(colType, nil)
+		return field, b.NewArray(), nil
+	case []float32:
+		field.Type = arrow.PrimitiveTypes.Float32
+		b := array.NewFloat32Builder(memory.DefaultAllocator)
+		defer b.Release()
+		b.AppendValues(colType, nil)
+		return field, b.NewArray(), nil
+	case []float64:
+		field.Type = arrow.PrimitiveTypes.Float64
+		b := array.NewFloat64Builder(memory.DefaultAllocator)
+		defer b.Release()
+		b.AppendValues(colType, nil)
+		return field, b.NewArray(), nil
+	case []string:
+		field.Type = arrow.BinaryTypes.String
+		b := array.NewStringBuilder(memory.DefaultAllocator)
+		defer b.Release()
+		b.AppendValues(colType, nil)
+		return field, b.NewArray(), nil
+	case []bool:
+		field.Type = arrow.FixedWidthTypes.Boolean
+		b := array.NewBooleanBuilder(memory.DefaultAllocator)
+		defer b.Release()
+		b.AppendValues(colType, nil)
+		return field, b.NewArray(), nil
+	}
+	// unreachable when callers gate on supportedType first, kept as a guard;
+	// note []byte and []rune are aliases of []uint8/[]int32 and ARE supported
+	return arrow.Field{}, nil, fmt.Errorf("unsupported column type for column %s", name)
+}
+
+// supportedType reports whether col is one of the slice types that
+// unpackColumn can convert.
+func supportedType(col any) bool {
+	switch col.(type) {
+	case []int, []int8, []int16, []int32, []int64,
+		[]uint, []uint8, []uint16, []uint32, []uint64,
+		[]float32, []float64,
+		[]string,
+		[]bool:
+		return true
+	default:
+		return false
+	}
+}
diff --git a/src/Backend/opti-sql-go/operators/project/custom_test.go b/src/Backend/opti-sql-go/operators/project/custom_test.go
new file mode 100644
index 0000000..ef08946
--- /dev/null
+++ b/src/Backend/opti-sql-go/operators/project/custom_test.go
@@ -0,0 +1,923 @@
+package project
+
+import (
+	"io"
+	"testing"
+
+	"github.com/apache/arrow/go/v17/arrow"
+	"github.com/apache/arrow/go/v17/arrow/array"
+)
+
+// generateTestColumns returns 8 column names and matching columns,
+// each column containing ~10 entries for testing purposes.
+func generateTestColumns() ([]string, []any) {
+	names := []string{
+		"id",
+		"name",
+		"age",
+		"salary",
+		"is_active",
+		"department",
+		"rating",
+		"years_experience",
+	}
+
+	columns := []any{
+		[]int32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+		[]string{
+			"Alice", "Bob", "Charlie", "David", "Eve",
+			"Frank", "Grace", "Hannah", "Ivy", "Jake",
+		},
+		[]int32{28, 34, 45, 22, 31, 29, 40, 36, 50, 26},
+		[]float64{
+			70000.0, 82000.5, 54000.0, 91000.0, 60000.0,
+			75000.0, 66000.0, 88000.0, 45000.0, 99000.0,
+		},
+		[]bool{true, false, true, true, false, false, true, true, false, true},
+		[]string{
+			"Engineering", "HR", "Engineering", "Sales", "Finance",
+			"Sales", "Support", "Engineering", "HR", "Finance",
+		},
+		[]float32{4.5, 3.8, 4.2, 2.9, 5.0, 4.3, 3.7, 4.9, 4.1, 3.5},
+		[]int32{1, 5, 10, 2, 7, 3, 6, 12, 4, 8},
+	}
+
+	return names, columns
+}
+
+// TestInMemoryBatchInit checks the happy-path construction of an
+// InMemorySource: schema and columns are populated and their counts agree.
+func TestInMemoryBatchInit(t *testing.T) {
+	// Simple passing test
+	names := []string{"id", "name", "age", "salary", "is_active"}
+	columns := []any{
+		[]int32{1, 2, 3, 4, 5},
+		[]string{"Alice", "Bob", "Charlie", "David", "Eve"},
+		[]int32{30, 25, 35, 28, 40},
+		[]float64{70000.0, 50000.0, 80000.0, 60000.0, 90000.0},
+		[]bool{true, false, true, true, false},
+	}
+	projC, err := NewInMemoryProjectExec(names, columns)
+	// Fatal variants (not Error) for the first three checks: every statement
+	// below dereferences projC/projC.schema, so continuing after a failure
+	// here would panic the test instead of failing it cleanly.
+	if err != nil {
+		t.Fatalf("Failed to create InMemoryProjectExec: %v", err)
+	}
+	if projC.schema == nil {
+		t.Fatal("Schema is nil")
+	}
+	if projC.columns == nil {
+		t.Fatal("Columns are nil")
+	}
+	if projC.schema.NumFields() != len(names) {
+		t.Errorf("Schema field count mismatch: got %d, want %d", projC.schema.NumFields(), len(names))
+	}
+	if len(projC.columns) != len(columns) {
+		t.Errorf("Columns count mismatch: got %d, want %d", len(projC.columns), len(columns))
+	}
+	if len(projC.columns) != projC.schema.NumFields() {
+		t.Errorf("Columns and schema field count mismatch: got %d and %d", len(projC.columns), projC.schema.NumFields())
+	}
+	t.Logf("schema: %v\n", projC.schema)
+}
+
+//
==================== COMPREHENSIVE TESTS FOR 100% CODE COVERAGE ==================== + +// TestSupportedType tests every branch of the supportedType function +func TestSupportedType(t *testing.T) { + tests := []struct { + name string + input any + expected bool + }{ + // Supported integer types + {"[]int", []int{1, 2, 3}, true}, + {"[]int8", []int8{1, 2, 3}, true}, + {"[]int16", []int16{1, 2, 3}, true}, + {"[]int32", []int32{1, 2, 3}, true}, + {"[]int64", []int64{1, 2, 3}, true}, + + // Supported unsigned integer types + {"[]uint", []uint{1, 2, 3}, true}, + {"[]uint8", []uint8{1, 2, 3}, true}, + {"[]uint16", []uint16{1, 2, 3}, true}, + {"[]uint32", []uint32{1, 2, 3}, true}, + {"[]uint64", []uint64{1, 2, 3}, true}, + + // Supported float types + {"[]float32", []float32{1.1, 2.2, 3.3}, true}, + {"[]float64", []float64{1.1, 2.2, 3.3}, true}, + + // Supported string type + {"[]string", []string{"a", "b", "c"}, true}, + + // Supported boolean type + {"[]bool", []bool{true, false, true}, true}, + + // Unsupported types + //{"[]byte", []byte{1, 2, 3}, false}, alias for uint8 + //{"[]rune", []rune{'a', 'b', 'c'}, false}, alias for int32 + {"[]interface{}", []interface{}{1, "a", true}, false}, + {"map[string]int", map[string]int{"a": 1}, false}, + {"string", "not a slice", false}, + {"int", 123, false}, + {"struct", struct{ x int }{x: 1}, false}, + {"nil", nil, false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := supportedType(tt.input) + if result != tt.expected { + t.Errorf("supportedType(%v) = %v, expected %v", tt.name, result, tt.expected) + } + }) + } +} + +// TestUnpackColumn tests every branch of the unpackColumm function +func TestUnpackColumn(t *testing.T) { + t.Run("[]int type", func(t *testing.T) { + field, arr, err := unpackColumn("test_int", []int{1, 2, 3, 4, 5}) + if err != nil { + t.Fatalf("unpackColumm failed: %v", err) + } + if field.Name != "test_int" { + t.Errorf("Expected field name 'test_int', got '%s'", 
field.Name) + } + if field.Type != arrow.PrimitiveTypes.Int64 { + t.Errorf("Expected Int64 type, got %v", field.Type) + } + if !field.Nullable { + t.Error("Expected field to be nullable") + } + int64Arr, ok := arr.(*array.Int64) + if !ok { + t.Fatalf("Expected *array.Int64, got %T", arr) + } + if int64Arr.Len() != 5 { + t.Errorf("Expected 5 elements, got %d", int64Arr.Len()) + } + for i := 0; i < 5; i++ { + if int64Arr.Value(i) != int64(i+1) { + t.Errorf("Element %d: expected %d, got %d", i, i+1, int64Arr.Value(i)) + } + } + }) + + t.Run("[]int8 type", func(t *testing.T) { + field, arr, err := unpackColumn("test_int8", []int8{-1, 0, 1, 127}) + if err != nil { + t.Fatalf("unpackColumm failed: %v", err) + } + if field.Type != arrow.PrimitiveTypes.Int8 { + t.Errorf("Expected Int8 type, got %v", field.Type) + } + int8Arr, ok := arr.(*array.Int8) + if !ok { + t.Fatalf("Expected *array.Int8, got %T", arr) + } + if int8Arr.Len() != 4 { + t.Errorf("Expected 4 elements, got %d", int8Arr.Len()) + } + }) + + t.Run("[]int16 type", func(t *testing.T) { + field, arr, err := unpackColumn("test_int16", []int16{-100, 0, 100, 32767}) + if err != nil { + t.Fatalf("unpackColumm failed: %v", err) + } + if field.Type != arrow.PrimitiveTypes.Int16 { + t.Errorf("Expected Int16 type, got %v", field.Type) + } + int16Arr, ok := arr.(*array.Int16) + if !ok { + t.Fatalf("Expected *array.Int16, got %T", arr) + } + if int16Arr.Len() != 4 { + t.Errorf("Expected 4 elements, got %d", int16Arr.Len()) + } + }) + + t.Run("[]int32 type", func(t *testing.T) { + field, arr, err := unpackColumn("test_int32", []int32{-1000, 0, 1000, 2147483647}) + if err != nil { + t.Fatalf("unpackColumm failed: %v", err) + } + if field.Type != arrow.PrimitiveTypes.Int32 { + t.Errorf("Expected Int32 type, got %v", field.Type) + } + int32Arr, ok := arr.(*array.Int32) + if !ok { + t.Fatalf("Expected *array.Int32, got %T", arr) + } + if int32Arr.Len() != 4 { + t.Errorf("Expected 4 elements, got %d", int32Arr.Len()) + } + }) + 
+ t.Run("[]int64 type", func(t *testing.T) { + field, arr, err := unpackColumn("test_int64", []int64{-9223372036854775808, 0, 9223372036854775807}) + if err != nil { + t.Fatalf("unpackColumm failed: %v", err) + } + if field.Type != arrow.PrimitiveTypes.Int64 { + t.Errorf("Expected Int64 type, got %v", field.Type) + } + int64Arr, ok := arr.(*array.Int64) + if !ok { + t.Fatalf("Expected *array.Int64, got %T", arr) + } + if int64Arr.Len() != 3 { + t.Errorf("Expected 3 elements, got %d", int64Arr.Len()) + } + }) + + t.Run("[]uint type", func(t *testing.T) { + field, arr, err := unpackColumn("test_uint", []uint{0, 1, 100, 1000}) + if err != nil { + t.Fatalf("unpackColumm failed: %v", err) + } + if field.Type != arrow.PrimitiveTypes.Uint64 { + t.Errorf("Expected Uint64 type, got %v", field.Type) + } + uint64Arr, ok := arr.(*array.Uint64) + if !ok { + t.Fatalf("Expected *array.Uint64, got %T", arr) + } + if uint64Arr.Len() != 4 { + t.Errorf("Expected 4 elements, got %d", uint64Arr.Len()) + } + expected := []uint64{0, 1, 100, 1000} + for i, exp := range expected { + if uint64Arr.Value(i) != exp { + t.Errorf("Element %d: expected %d, got %d", i, exp, uint64Arr.Value(i)) + } + } + }) + + t.Run("[]uint8 type", func(t *testing.T) { + field, arr, err := unpackColumn("test_uint8", []uint8{0, 1, 255}) + if err != nil { + t.Fatalf("unpackColumm failed: %v", err) + } + if field.Type != arrow.PrimitiveTypes.Uint8 { + t.Errorf("Expected Uint8 type, got %v", field.Type) + } + uint8Arr, ok := arr.(*array.Uint8) + if !ok { + t.Fatalf("Expected *array.Uint8, got %T", arr) + } + if uint8Arr.Len() != 3 { + t.Errorf("Expected 3 elements, got %d", uint8Arr.Len()) + } + }) + + t.Run("[]uint16 type", func(t *testing.T) { + field, arr, err := unpackColumn("test_uint16", []uint16{0, 100, 65535}) + if err != nil { + t.Fatalf("unpackColumm failed: %v", err) + } + if field.Type != arrow.PrimitiveTypes.Uint16 { + t.Errorf("Expected Uint16 type, got %v", field.Type) + } + uint16Arr, ok := 
arr.(*array.Uint16) + if !ok { + t.Fatalf("Expected *array.Uint16, got %T", arr) + } + if uint16Arr.Len() != 3 { + t.Errorf("Expected 3 elements, got %d", uint16Arr.Len()) + } + }) + + t.Run("[]uint32 type", func(t *testing.T) { + field, arr, err := unpackColumn("test_uint32", []uint32{0, 1000, 4294967295}) + if err != nil { + t.Fatalf("unpackColumm failed: %v", err) + } + if field.Type != arrow.PrimitiveTypes.Uint32 { + t.Errorf("Expected Uint32 type, got %v", field.Type) + } + uint32Arr, ok := arr.(*array.Uint32) + if !ok { + t.Fatalf("Expected *array.Uint32, got %T", arr) + } + if uint32Arr.Len() != 3 { + t.Errorf("Expected 3 elements, got %d", uint32Arr.Len()) + } + }) + + t.Run("[]uint64 type", func(t *testing.T) { + field, arr, err := unpackColumn("test_uint64", []uint64{0, 1000, 18446744073709551615}) + if err != nil { + t.Fatalf("unpackColumm failed: %v", err) + } + if field.Type != arrow.PrimitiveTypes.Uint64 { + t.Errorf("Expected Uint64 type, got %v", field.Type) + } + uint64Arr, ok := arr.(*array.Uint64) + if !ok { + t.Fatalf("Expected *array.Uint64, got %T", arr) + } + if uint64Arr.Len() != 3 { + t.Errorf("Expected 3 elements, got %d", uint64Arr.Len()) + } + }) + + t.Run("[]float32 type", func(t *testing.T) { + field, arr, err := unpackColumn("test_float32", []float32{-1.5, 0.0, 1.5, 3.14159}) + if err != nil { + t.Fatalf("unpackColumm failed: %v", err) + } + if field.Type != arrow.PrimitiveTypes.Float32 { + t.Errorf("Expected Float32 type, got %v", field.Type) + } + float32Arr, ok := arr.(*array.Float32) + if !ok { + t.Fatalf("Expected *array.Float32, got %T", arr) + } + if float32Arr.Len() != 4 { + t.Errorf("Expected 4 elements, got %d", float32Arr.Len()) + } + }) + + t.Run("[]float64 type", func(t *testing.T) { + field, arr, err := unpackColumn("test_float64", []float64{-2.718281828, 0.0, 3.141592653589793}) + if err != nil { + t.Fatalf("unpackColumm failed: %v", err) + } + if field.Type != arrow.PrimitiveTypes.Float64 { + t.Errorf("Expected Float64 
type, got %v", field.Type) + } + float64Arr, ok := arr.(*array.Float64) + if !ok { + t.Fatalf("Expected *array.Float64, got %T", arr) + } + if float64Arr.Len() != 3 { + t.Errorf("Expected 3 elements, got %d", float64Arr.Len()) + } + }) + + t.Run("[]string type", func(t *testing.T) { + field, arr, err := unpackColumn("test_string", []string{"hello", "world", "test", ""}) + if err != nil { + t.Fatalf("unpackColumm failed: %v", err) + } + if field.Type != arrow.BinaryTypes.String { + t.Errorf("Expected String type, got %v", field.Type) + } + stringArr, ok := arr.(*array.String) + if !ok { + t.Fatalf("Expected *array.String, got %T", arr) + } + if stringArr.Len() != 4 { + t.Errorf("Expected 4 elements, got %d", stringArr.Len()) + } + expected := []string{"hello", "world", "test", ""} + for i, exp := range expected { + if stringArr.Value(i) != exp { + t.Errorf("Element %d: expected '%s', got '%s'", i, exp, stringArr.Value(i)) + } + } + }) + + t.Run("[]bool type", func(t *testing.T) { + field, arr, err := unpackColumn("test_bool", []bool{true, false, true, false, true}) + if err != nil { + t.Fatalf("unpackColumm failed: %v", err) + } + if field.Type != arrow.FixedWidthTypes.Boolean { + t.Errorf("Expected Boolean type, got %v", field.Type) + } + boolArr, ok := arr.(*array.Boolean) + if !ok { + t.Fatalf("Expected *array.Boolean, got %T", arr) + } + if boolArr.Len() != 5 { + t.Errorf("Expected 5 elements, got %d", boolArr.Len()) + } + expected := []bool{true, false, true, false, true} + for i, exp := range expected { + if boolArr.Value(i) != exp { + t.Errorf("Element %d: expected %v, got %v", i, exp, boolArr.Value(i)) + } + } + }) + + t.Run("Unsupported type - default case", func(t *testing.T) { + _, _, err := unpackColumn("test_unsupported", []byte{1, 2, 3}) + if err != nil { + t.Error("unexpected error for unsupported type") + } + + }) + + t.Run("Empty slices", func(t *testing.T) { + field, arr, err := unpackColumn("empty_int", []int{}) + if err != nil { + 
t.Fatalf("unpackColumm failed for empty slice: %v", err) + } + if arr.Len() != 0 { + t.Errorf("Expected 0 elements for empty slice, got %d", arr.Len()) + } + if field.Name != "empty_int" { + t.Errorf("Expected field name 'empty_int', got '%s'", field.Name) + } + }) +} + +// TestNewInMemoryProjectExec tests the constructor comprehensively +func TestNewInMemoryProjectExec(t *testing.T) { + t.Run("Valid construction with all types", func(t *testing.T) { + names := []string{ + "col_int", "col_int8", "col_int16", "col_int32", "col_int64", + "col_uint", "col_uint8", "col_uint16", "col_uint32", "col_uint64", + "col_float32", "col_float64", "col_string", "col_bool", + } + columns := []any{ + []int{1, 2}, + []int8{1, 2}, + []int16{1, 2}, + []int32{1, 2}, + []int64{1, 2}, + []uint{1, 2}, + []uint8{1, 2}, + []uint16{1, 2}, + []uint32{1, 2}, + []uint64{1, 2}, + []float32{1.1, 2.2}, + []float64{1.1, 2.2}, + []string{"a", "b"}, + []bool{true, false}, + } + + proj, err := NewInMemoryProjectExec(names, columns) + if err != nil { + t.Fatalf("NewInMemoryProjectExec failed: %v", err) + } + + if proj == nil { + t.Fatal("InMemoryProjectExec is nil") + } + if proj.schema == nil { + t.Fatal("Schema is nil") + } + if proj.columns == nil { + t.Fatal("Columns are nil") + } + if proj.schema.NumFields() != len(names) { + t.Errorf("Expected %d fields, got %d", len(names), proj.schema.NumFields()) + } + if len(proj.columns) != len(columns) { + t.Errorf("Expected %d columns, got %d", len(columns), len(proj.columns)) + } + + // Verify each field name matches + fields := proj.schema.Fields() + for i, expectedName := range names { + if fields[i].Name != expectedName { + t.Errorf("Field %d: expected name '%s', got '%s'", i, expectedName, fields[i].Name) + } + if !fields[i].Nullable { + t.Errorf("Field %d (%s): expected nullable=true", i, expectedName) + } + } + + // Verify each column has correct length + for i, col := range proj.columns { + if col.Len() != 2 { + t.Errorf("Column %d: expected length 
2, got %d", i, col.Len()) + } + } + }) + + t.Run("Mismatched names and columns count", func(t *testing.T) { + names := []string{"col1", "col2"} + columns := []any{[]int{1, 2, 3}} + + _, err := NewInMemoryProjectExec(names, columns) + if err == nil { + t.Error("Expected error for mismatched names and columns, got nil") + } + }) + + t.Run("Unsupported type - supportedType returns false", func(t *testing.T) { + // Custom struct type is not supported + type CustomStruct struct { + ID int + Name string + } + + names := []string{"col1"} + columns := []any{[]CustomStruct{{1, "test"}, {2, "data"}}} + + _, err := NewInMemoryProjectExec(names, columns) + if err == nil { + t.Error("Expected error for unsupported type, got nil") + } + + }) + + t.Run("Single column", func(t *testing.T) { + names := []string{"only_col"} + columns := []any{[]int{10, 20, 30}} + + proj, err := NewInMemoryProjectExec(names, columns) + if err != nil { + t.Fatalf("NewInMemoryProjectExec failed: %v", err) + } + + if proj.schema.NumFields() != 1 { + t.Errorf("Expected 1 field, got %d", proj.schema.NumFields()) + } + if len(proj.columns) != 1 { + t.Errorf("Expected 1 column, got %d", len(proj.columns)) + } + if proj.columns[0].Len() != 3 { + t.Errorf("Expected column length 3, got %d", proj.columns[0].Len()) + } + }) + + t.Run("Empty columns", func(t *testing.T) { + names := []string{} + columns := []any{} + + proj, err := NewInMemoryProjectExec(names, columns) + if err != nil { + t.Fatalf("NewInMemoryProjectExec failed for empty input: %v", err) + } + + if proj.schema.NumFields() != 0 { + t.Errorf("Expected 0 fields, got %d", proj.schema.NumFields()) + } + if len(proj.columns) != 0 { + t.Errorf("Expected 0 columns, got %d", len(proj.columns)) + } + }) + + t.Run("Columns with different lengths - valid construction", func(t *testing.T) { + // Note: The function doesn't validate that all columns have the same length + // This is valid construction even though columns have different lengths + names := 
[]string{"col1", "col2"} + columns := []any{ + []int{1, 2, 3}, + []string{"a", "b"}, + } + + proj, err := NewInMemoryProjectExec(names, columns) + if err != nil { + t.Fatalf("NewInMemoryProjectExec failed: %v", err) + } + + if proj.columns[0].Len() != 3 { + t.Errorf("Column 0: expected length 3, got %d", proj.columns[0].Len()) + } + if proj.columns[1].Len() != 2 { + t.Errorf("Column 1: expected length 2, got %d", proj.columns[1].Len()) + } + }) + + t.Run("Complex field names", func(t *testing.T) { + names := []string{"Column_1", "column-2", "Column.3", "column 4"} + columns := []any{ + []int{1}, + []int{2}, + []int{3}, + []int{4}, + } + + proj, err := NewInMemoryProjectExec(names, columns) + if err != nil { + t.Fatalf("NewInMemoryProjectExec failed: %v", err) + } + + fields := proj.schema.Fields() + for i, expectedName := range names { + if fields[i].Name != expectedName { + t.Errorf("Field %d: expected name '%s', got '%s'", i, expectedName, fields[i].Name) + } + } + }) +} + +// TestErrInvalidInMemoryDataType tests the error constructor +func TestErrInvalidInMemoryDataType(t *testing.T) { + testType := []byte{1, 2, 3} + err := ErrInvalidInMemoryDataType(testType) + + if err == nil { + t.Fatal("ErrInvalidInMemoryDataType returned nil") + } + + expectedMsg := "[]uint8 is not a supported in memory dataType for InMemoryProjectExec" + if err.Error() != expectedMsg { + t.Errorf("Expected error message '%s', got '%s'", expectedMsg, err.Error()) + } + + // Test with different type + testType2 := map[string]int{"key": 1} + err2 := ErrInvalidInMemoryDataType(testType2) + expectedMsg2 := "map[string]int is not a supported in memory dataType for InMemoryProjectExec" + if err2.Error() != expectedMsg2 { + t.Errorf("Expected error message '%s', got '%s'", expectedMsg2, err2.Error()) + } +} + +// TestSchemaFieldTypes verifies the correct Arrow types are assigned +func TestSchemaFieldTypes(t *testing.T) { + names := []string{ + "int", "int8", "int16", "int32", "int64", + "uint", 
"uint8", "uint16", "uint32", "uint64", + "float32", "float64", "string", "bool", + } + columns := []any{ + []int{1}, []int8{1}, []int16{1}, []int32{1}, []int64{1}, + []uint{1}, []uint8{1}, []uint16{1}, []uint32{1}, []uint64{1}, + []float32{1.0}, []float64{1.0}, []string{"a"}, []bool{true}, + } + + expectedTypes := []arrow.DataType{ + arrow.PrimitiveTypes.Int64, // []int -> Int64 + arrow.PrimitiveTypes.Int8, // []int8 -> Int8 + arrow.PrimitiveTypes.Int16, // []int16 -> Int16 + arrow.PrimitiveTypes.Int32, // []int32 -> Int32 + arrow.PrimitiveTypes.Int64, // []int64 -> Int64 + arrow.PrimitiveTypes.Uint64, // []uint -> Uint64 + arrow.PrimitiveTypes.Uint8, // []uint8 -> Uint8 + arrow.PrimitiveTypes.Uint16, // []uint16 -> Uint16 + arrow.PrimitiveTypes.Uint32, // []uint32 -> Uint32 + arrow.PrimitiveTypes.Uint64, // []uint64 -> Uint64 + arrow.PrimitiveTypes.Float32, // []float32 -> Float32 + arrow.PrimitiveTypes.Float64, // []float64 -> Float64 + arrow.BinaryTypes.String, // []string -> String + arrow.FixedWidthTypes.Boolean, // []bool -> Boolean + } + + proj, err := NewInMemoryProjectExec(names, columns) + if err != nil { + t.Fatalf("NewInMemoryProjectExec failed: %v", err) + } + + fields := proj.schema.Fields() + for i, expectedType := range expectedTypes { + if fields[i].Type != expectedType { + t.Errorf("Field %d (%s): expected type %v, got %v", + i, names[i], expectedType, fields[i].Type) + } + } +} + +func TestPruneSchema(t *testing.T) { + names, columns := generateTestColumns() + + t.Run("Select subset of fields", func(t *testing.T) { + // Create a fresh instance for this test + testProj, err := NewInMemoryProjectExec(names, columns) + if err != nil { + t.Fatalf("NewInMemoryProjectExec failed: %v", err) + } + + // Original schema should have 8 fields + originalFieldCount := testProj.schema.NumFields() + if originalFieldCount != 8 { + t.Errorf("Expected 8 fields in original schema, got %d", originalFieldCount) + } + + // Select only a subset of fields + 
selectedFields := []string{"id", "name", "salary"} + err = testProj.withFields(selectedFields...) + if err != nil { + t.Error("unexpected error when pruning columns") + } + + // After pruning, schema should have only 3 fields + prunedFieldCount := testProj.schema.NumFields() + if prunedFieldCount != 3 { + t.Errorf("Expected 3 fields after pruning, got %d", prunedFieldCount) + } + + // Verify the field names match + fields := testProj.schema.Fields() + for i, expectedName := range selectedFields { + if fields[i].Name != expectedName { + t.Errorf("Field %d: expected name '%s', got '%s'", i, expectedName, fields[i].Name) + } + } + + // Verify field order is preserved + if fields[0].Name != "id" || fields[1].Name != "name" || fields[2].Name != "salary" { + t.Error("Field order not preserved after pruning") + } + }) + + t.Run("Select single field", func(t *testing.T) { + // Create a fresh instance for this test + testProj, err := NewInMemoryProjectExec(names, columns) + if err != nil { + t.Fatalf("NewInMemoryProjectExec failed: %v", err) + } + + // Select only one field + err = testProj.withFields("department") + if err != nil { + t.Error("unexpected error when pruning columns") + } + + // After pruning, schema should have only 1 field + prunedFieldCount := testProj.schema.NumFields() + if prunedFieldCount != 1 { + t.Errorf("Expected 1 field after pruning, got %d", prunedFieldCount) + } + + // Verify the field name + fields := testProj.schema.Fields() + if fields[0].Name != "department" { + t.Errorf("Expected field name 'department', got '%s'", fields[0].Name) + } + + // Verify the field type is preserved (should be String since department is []string) + if fields[0].Type != arrow.BinaryTypes.String { + t.Errorf("Expected String type, got %v", fields[0].Type) + } + }) +} + +// TestNext tests the Next function with projection and iteration +func TestNext(t *testing.T) { + t.Run("Read all data in single batch", func(t *testing.T) { + names, columns := 
generateTestColumns() + proj, err := NewInMemoryProjectExec(names, columns) + if err != nil { + t.Fatalf("NewInMemoryProjectExec failed: %v", err) + } + + // Read all 10 rows in one batch + batch, err := proj.Next(100) + if err != nil { + t.Fatalf("Next failed: %v", err) + } + + if batch == nil { + t.Fatal("Expected batch, got nil") + } + + // Verify we got all 10 rows + if len(batch.Columns) != 8 { + t.Errorf("Expected 8 columns, got %d", len(batch.Columns)) + } + if batch.Columns[0].Len() != 10 { + t.Errorf("Expected 10 rows, got %d", batch.Columns[0].Len()) + } + + // Next call should return EOF + _, err = proj.Next(1) + if err != io.EOF { + t.Errorf("Expected EOF after reading all data, got: %v", err) + } + }) + + t.Run("Read with projection and iterate to EOF", func(t *testing.T) { + names, columns := generateTestColumns() + proj, err := NewInMemoryProjectExec(names, columns) + if err != nil { + t.Fatalf("NewInMemoryProjectExec failed: %v", err) + } + + // Project to only 3 columns + err = proj.withFields("id", "name", "salary") + if err != nil { + t.Error("unexpected error when pruning columns") + } + totalRowsRead := 0 + batchCount := 0 + + // Iterate until EOF + for { + batch, err := proj.Next(3) + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("Next failed on batch %d: %v", batchCount+1, err) + } + + batchCount++ + totalRowsRead += batch.Columns[0].Len() + + // Verify projected schema has only 3 fields + if len(batch.Columns) != 3 { + t.Errorf("Batch %d: expected 3 columns after projection, got %d", batchCount, len(batch.Columns)) + } + + // Verify field names + fields := batch.Schema.Fields() + expectedNames := []string{"id", "name", "salary"} + for i, expectedName := range expectedNames { + if fields[i].Name != expectedName { + t.Errorf("Batch %d, Field %d: expected '%s', got '%s'", batchCount, i, expectedName, fields[i].Name) + } + } + } + + // Verify we read all 10 rows total + if totalRowsRead != 10 { + t.Errorf("Expected to read 10 total 
rows, got %d", totalRowsRead) + } + + // Verify we got 4 batches (3+3+3+1) + if batchCount != 4 { + t.Errorf("Expected 4 batches, got %d", batchCount) + } + }) + + t.Run("Multiple Next calls with small batch size", func(t *testing.T) { + names, columns := generateTestColumns() + proj, err := NewInMemoryProjectExec(names, columns) + if err != nil { + t.Fatalf("NewInMemoryProjectExec failed: %v", err) + } + + // Project to 2 columns + err = proj.withFields("age", "is_active") + if err != nil { + t.Error("unexpected error when pruning columns") + } + + // Read 2 rows at a time + batch1, err := proj.Next(2) + if err != nil { + t.Fatalf("First Next failed: %v", err) + } + if batch1.Columns[0].Len() != 2 { + t.Errorf("First batch: expected 2 rows, got %d", batch1.Columns[0].Len()) + } + + batch2, err := proj.Next(2) + if err != nil { + t.Fatalf("Second Next failed: %v", err) + } + if batch2.Columns[0].Len() != 2 { + t.Errorf("Second batch: expected 2 rows, got %d", batch2.Columns[0].Len()) + } + + // Continue reading until EOF + rowsRemaining := 0 + for { + batch, err := proj.Next(2) + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("Next failed: %v", err) + } + rowsRemaining += batch.Columns[0].Len() + } + + // We read 4 rows in first two batches, so 6 should remain + if rowsRemaining != 6 { + t.Errorf("Expected 6 remaining rows, got %d", rowsRemaining) + } + }) + + t.Run("Single field projection with iteration", func(t *testing.T) { + names, columns := generateTestColumns() + proj, err := NewInMemoryProjectExec(names, columns) + if err != nil { + t.Fatalf("NewInMemoryProjectExec failed: %v", err) + } + + // Project to just the department column + err = proj.withFields("department") + if err != nil { + t.Error("unexpected error when pruning columns") + } + t.Logf("updated: %s\n", proj.schema) + t.Logf("new Mapping: %v\n", proj.fieldToColIDx) + t.Logf("new columns: %v\n", proj.columns) + + totalRows := 0 + for { + batch, err := proj.Next(5) + if err == io.EOF 
{ + break + } + if err != nil { + t.Fatalf("Next failed: %v", err) + } + t.Logf("Batche schema: %v\n", batch.Schema) + t.Logf("Batch data: %v\n", batch.Columns) + + // Verify only 1 column + if len(batch.Columns) != 1 { + t.Errorf("Expected 1 column, got %d", len(batch.Columns)) + } + + // Verify it's a string array + if _, ok := batch.Columns[0].(*array.String); !ok { + t.Errorf("Expected *array.String, got %T", batch.Columns[0]) + } + + totalRows += batch.Columns[0].Len() + } + + if totalRows != 10 { + t.Errorf("Expected 10 total rows, got %d", totalRows) + } + }) +} diff --git a/src/Backend/opti-sql-go/operators/project/parquet.go b/src/Backend/opti-sql-go/operators/project/parquet.go new file mode 100644 index 0000000..50b04a4 --- /dev/null +++ b/src/Backend/opti-sql-go/operators/project/parquet.go @@ -0,0 +1,420 @@ +package project + +import ( + "context" + "errors" + "fmt" + "io" + "opti-sql-go/operators" + "opti-sql-go/operators/filter" + + "github.com/apache/arrow/go/v17/arrow" + "github.com/apache/arrow/go/v17/arrow/array" + "github.com/apache/arrow/go/v17/arrow/memory" + "github.com/apache/arrow/go/v17/parquet" + "github.com/apache/arrow/go/v17/parquet/file" + "github.com/apache/arrow/go/v17/parquet/pqarrow" +) + +var ( + _ = (operators.Operator)(&ParquetSource{}) +) + +type ParquetSource struct { + // existing fields + schema *arrow.Schema + projectionPushDown []string // columns to project up + predicatePushDown []filter.FilterExpr // simple predicate push down for now + reader pqarrow.RecordReader + // for internal reading + done bool // if set to true always return io.EOF +} + +func NewParquetSource(r parquet.ReaderAtSeeker) (*ParquetSource, error) { + allocator := memory.NewGoAllocator() + filerReader, err := file.NewParquetReader(r) + if err != nil { + return nil, err + } + + defer func() { + if err := filerReader.Close(); err != nil { + fmt.Printf("warning: failed to close parquet reader: %v\n", err) + } + }() + + arrowReader, err := 
pqarrow.NewFileReader( + filerReader, + pqarrow.ArrowReadProperties{Parallel: true, BatchSize: 5}, // TODO: Read in from config for this stuff + allocator, + ) + if err != nil { + return nil, err + } + rdr, err := arrowReader.GetRecordReader(context.TODO(), nil, nil) + if err != nil { + return nil, err + } + + return &ParquetSource{ + schema: rdr.Schema(), + projectionPushDown: []string{}, + predicatePushDown: nil, + reader: rdr, + }, nil + +} + +// source, columns you want to be push up the tree, any filters +func NewParquetSourcePushDown(r parquet.ReaderAtSeeker, columns []string, filters []filter.FilterExpr) (*ParquetSource, error) { + if len(columns) == 0 { + return nil, errors.New("no columns were provided for projection push down") + } + allocator := memory.NewGoAllocator() + filerReader, err := file.NewParquetReader(r) + if err != nil { + return nil, err + } + + defer func() { + if err := filerReader.Close(); err != nil { + fmt.Printf("warning: failed to close parquet reader: %v\n", err) + + } + }() + + arrowReader, err := pqarrow.NewFileReader( + filerReader, + pqarrow.ArrowReadProperties{Parallel: true, BatchSize: 5}, // TODO: Read in from config for this stuff + allocator, + ) + if err != nil { + return nil, err + } + var wantedColumnsIDX []int + s, _ := arrowReader.Schema() + for _, col := range columns { + idx_array := s.FieldIndices(col) + if len(idx_array) == 0 { + return nil, errors.New("unknown column passed in to be project push down") + } + wantedColumnsIDX = append(wantedColumnsIDX, idx_array...) 
+ } + + rdr, err := arrowReader.GetRecordReader(context.TODO(), wantedColumnsIDX, nil) + if err != nil { + return nil, err + } + + return &ParquetSource{ + schema: rdr.Schema(), + projectionPushDown: columns, + predicatePushDown: filters, + reader: rdr, + }, nil +} + +// This should be 1 +func (ps *ParquetSource) Next(n uint16) (*operators.RecordBatch, error) { + if ps.reader == nil || ps.done || !ps.reader.Next() { + return nil, io.EOF + } + columns := make([]arrow.Array, len(ps.schema.Fields())) + curRow := 0 + for curRow < int(n) && ps.reader.Next() { + err := ps.reader.Err() + if err != nil { + return nil, err + } + record := ps.reader.Record() + numCols := int(record.NumCols()) + numRows := int(record.NumRows()) + + for colIdx := 0; colIdx < numCols; colIdx++ { + + batchCol := record.Column(colIdx) + existing := columns[colIdx] + // First time seeing this column → just assign it + if existing == nil { + batchCol.Retain() + columns[colIdx] = batchCol + continue + } + + // Otherwise combine existing + new batch column + combined := CombineArray(existing, batchCol) + + // Replace + columns[colIdx] = combined + + // VERY IMPORTANT: + // Release the old existing array to avoid leaks + existing.Release() + } + record.Release() + + curRow += numRows + } + return &operators.RecordBatch{ + Schema: ps.schema, // Remove the pointer as ps.Schema is already of type arrow.Schema + Columns: columns, + RowCount: uint64(curRow), + }, nil +} +func (ps *ParquetSource) Close() error { + ps.reader.Release() + ps.reader = nil + return nil +} +func (ps *ParquetSource) Schema() *arrow.Schema { + return ps.schema +} + +// append arr2 to arr1 so (arr1 + arr2) = arr1-arr2 +func CombineArray(a1, a2 arrow.Array) arrow.Array { + if a1 == nil { + return a2 + } + if a2 == nil { + return a1 + } + + mem := memory.NewGoAllocator() + dt := a1.DataType() + + switch dt.ID() { + + // -------------------- INT TYPES -------------------- + case arrow.INT8: + b := array.NewInt8Builder(mem) + 
appendInt8(b, a1.(*array.Int8)) + appendInt8(b, a2.(*array.Int8)) + return b.NewArray() + + case arrow.INT16: + b := array.NewInt16Builder(mem) + appendInt16(b, a1.(*array.Int16)) + appendInt16(b, a2.(*array.Int16)) + return b.NewArray() + + case arrow.INT32: + b := array.NewInt32Builder(mem) + appendInt32(b, a1.(*array.Int32)) + appendInt32(b, a2.(*array.Int32)) + return b.NewArray() + + case arrow.INT64: + b := array.NewInt64Builder(mem) + appendInt64(b, a1.(*array.Int64)) + appendInt64(b, a2.(*array.Int64)) + return b.NewArray() + + // -------------------- UINT TYPES -------------------- + case arrow.UINT8: + b := array.NewUint8Builder(mem) + appendUint8(b, a1.(*array.Uint8)) + appendUint8(b, a2.(*array.Uint8)) + return b.NewArray() + + case arrow.UINT16: + b := array.NewUint16Builder(mem) + appendUint16(b, a1.(*array.Uint16)) + appendUint16(b, a2.(*array.Uint16)) + return b.NewArray() + + case arrow.UINT32: + b := array.NewUint32Builder(mem) + appendUint32(b, a1.(*array.Uint32)) + appendUint32(b, a2.(*array.Uint32)) + return b.NewArray() + + case arrow.UINT64: + b := array.NewUint64Builder(mem) + appendUint64(b, a1.(*array.Uint64)) + appendUint64(b, a2.(*array.Uint64)) + return b.NewArray() + + // -------------------- FLOAT TYPES -------------------- + case arrow.FLOAT32: + b := array.NewFloat32Builder(mem) + appendFloat32(b, a1.(*array.Float32)) + appendFloat32(b, a2.(*array.Float32)) + return b.NewArray() + + case arrow.FLOAT64: + b := array.NewFloat64Builder(mem) + appendFloat64(b, a1.(*array.Float64)) + appendFloat64(b, a2.(*array.Float64)) + return b.NewArray() + + // -------------------- BOOLEAN -------------------- + case arrow.BOOL: + b := array.NewBooleanBuilder(mem) + appendBool(b, a1.(*array.Boolean)) + appendBool(b, a2.(*array.Boolean)) + return b.NewArray() + + // -------------------- STRING TYPES -------------------- + case arrow.STRING: + b := array.NewStringBuilder(mem) + appendString(b, a1.(*array.String)) + appendString(b, a2.(*array.String)) 
+ return b.NewArray() + + case arrow.LARGE_STRING: + b := array.NewLargeStringBuilder(mem) + appendLargeString(b, a1.(*array.LargeString)) + appendLargeString(b, a2.(*array.LargeString)) + return b.NewArray() + + // -------------------- BINARY TYPES -------------------- + case arrow.BINARY: + b := array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + appendBinary(b, a1.(*array.Binary)) + appendBinary(b, a2.(*array.Binary)) + return b.NewArray() + + default: + panic(fmt.Sprintf("unsupported datatype in CombineArray: %v", dt)) + } +} + +func appendInt8(b *array.Int8Builder, c *array.Int8) { + for i := 0; i < c.Len(); i++ { + if c.IsNull(i) { + b.AppendNull() + continue + } + b.Append(c.Value(i)) + } +} + +func appendInt16(b *array.Int16Builder, c *array.Int16) { + for i := 0; i < c.Len(); i++ { + if c.IsNull(i) { + b.AppendNull() + continue + } + b.Append(c.Value(i)) + } +} + +func appendInt32(b *array.Int32Builder, c *array.Int32) { + for i := 0; i < c.Len(); i++ { + if c.IsNull(i) { + b.AppendNull() + continue + } + b.Append(c.Value(i)) + } +} + +func appendInt64(b *array.Int64Builder, c *array.Int64) { + for i := 0; i < c.Len(); i++ { + if c.IsNull(i) { + b.AppendNull() + continue + } + b.Append(c.Value(i)) + } +} + +func appendUint8(b *array.Uint8Builder, c *array.Uint8) { + for i := 0; i < c.Len(); i++ { + if c.IsNull(i) { + b.AppendNull() + continue + } + b.Append(c.Value(i)) + } +} + +func appendUint16(b *array.Uint16Builder, c *array.Uint16) { + for i := 0; i < c.Len(); i++ { + if c.IsNull(i) { + b.AppendNull() + continue + } + b.Append(c.Value(i)) + } +} + +func appendUint32(b *array.Uint32Builder, c *array.Uint32) { + for i := 0; i < c.Len(); i++ { + if c.IsNull(i) { + b.AppendNull() + continue + } + b.Append(c.Value(i)) + } +} + +func appendUint64(b *array.Uint64Builder, c *array.Uint64) { + for i := 0; i < c.Len(); i++ { + if c.IsNull(i) { + b.AppendNull() + continue + } + b.Append(c.Value(i)) + } +} + +func appendFloat32(b *array.Float32Builder, c 
*array.Float32) { + for i := 0; i < c.Len(); i++ { + if c.IsNull(i) { + b.AppendNull() + continue + } + b.Append(c.Value(i)) + } +} + +func appendFloat64(b *array.Float64Builder, c *array.Float64) { + for i := 0; i < c.Len(); i++ { + if c.IsNull(i) { + b.AppendNull() + continue + } + b.Append(c.Value(i)) + } +} + +func appendBool(b *array.BooleanBuilder, c *array.Boolean) { + for i := 0; i < c.Len(); i++ { + if c.IsNull(i) { + b.AppendNull() + continue + } + b.Append(c.Value(i)) + } +} + +func appendString(b *array.StringBuilder, c *array.String) { + for i := 0; i < c.Len(); i++ { + if c.IsNull(i) { + b.AppendNull() + continue + } + b.Append(c.Value(i)) + } +} + +func appendLargeString(b *array.LargeStringBuilder, c *array.LargeString) { + for i := 0; i < c.Len(); i++ { + if c.IsNull(i) { + b.AppendNull() + continue + } + b.Append(c.Value(i)) + } +} + +func appendBinary(b *array.BinaryBuilder, c *array.Binary) { + for i := 0; i < c.Len(); i++ { + if c.IsNull(i) { + b.AppendNull() + continue + } + b.Append(c.Value(i)) + } +} diff --git a/src/Backend/opti-sql-go/operators/project/parquet_test.go b/src/Backend/opti-sql-go/operators/project/parquet_test.go new file mode 100644 index 0000000..c383a07 --- /dev/null +++ b/src/Backend/opti-sql-go/operators/project/parquet_test.go @@ -0,0 +1,724 @@ +package project + +import ( + "io" + "os" + "testing" + + "github.com/apache/arrow/go/v17/arrow" + "github.com/apache/arrow/go/v17/arrow/array" + "github.com/apache/arrow/go/v17/arrow/memory" +) + +const ParquetTestDatafile = "../../../test_data/parquet/capitals_clean.parquet" + +func getTestParquetFile() *os.File { + file, err := os.Open(ParquetTestDatafile) + if err != nil { + panic(err) + } + return file +} + +/* +schema: + + fields: 5 + - country: type=utf8, nullable + metadata: ["PARQUET:field_id": "-1"] + - country_alpha2: type=utf8, nullable + metadata: ["PARQUET:field_id": "-1"] + - capital: type=utf8, nullable + metadata: ["PARQUET:field_id": "-1"] + - lat: 
type=float64, nullable + metadata: ["PARQUET:field_id": "-1"] + - lon: type=float64, nullable +*/ +// TODO: more to their own files later down the line +func existIn(str string, arr []string) bool { + for _, a := range arr { + if a == str { + return true + } + } + return false +} +func sameStringSlice(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} +func TestParquetInit(t *testing.T) { + t.Run("Test No names pass in", func(t *testing.T) { + f := getTestParquetFile() + + _, err := NewParquetSourcePushDown(f, []string{}, nil) + if err == nil { + t.Errorf("Expected error when no columns are passed in, but got nil") + } + }) + + t.Run("Test invalid names are passed in", func(t *testing.T) { + f := getTestParquetFile() + _, err := NewParquetSourcePushDown(f, []string{"non_existent_column"}, nil) + if err == nil { + t.Errorf("Expected error when invalid column names are passed in, but got nil") + } + }) + + t.Run("Test correct schema is returned", func(t *testing.T) { + f := getTestParquetFile() + columns := []string{"country", "capital", "lat"} + source, err := NewParquetSourcePushDown(f, columns, nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + schema := source.Schema() + if len(schema.Fields()) != len(columns) { + t.Errorf("Expected schema to have %d fields, got %d", len(columns), len(schema.Fields())) + } + for _, field := range schema.Fields() { + if !existIn(field.Name, columns) { + t.Errorf("Field %s not found in expected columns %v", field.Name, columns) + } + } + + }) + + t.Run("Test input columns and filters were passed back out", func(t *testing.T) { + f := getTestParquetFile() + columns := []string{"country", "capital", "lat"} + source, err := NewParquetSourcePushDown(f, columns, nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + if len(source.projectionPushDown) != len(columns) { + t.Errorf("Expected projectionPushDown to 
have %d columns, got %d", len(columns), len(source.projectionPushDown)) + } + if !sameStringSlice(source.projectionPushDown, columns) || source.predicatePushDown != nil { + t.Errorf("Expected projectionPushDown to be %v and predicatePushDown to be nil, got %v and %v", columns, source.projectionPushDown, source.predicatePushDown) + } + }) + + t.Run("Check reader isnt null", func(t *testing.T) { + + f := getTestParquetFile() + columns := []string{"country", "capital", "lat"} + source, err := NewParquetSourcePushDown(f, columns, nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + if source.reader == nil { + t.Errorf("Expected reader to be initialized, but got nil") + } + + }) + +} +func TestParquetClose(t *testing.T) { + f := getTestParquetFile() + columns := []string{"country", "capital", "lat"} + source, err := NewParquetSourcePushDown(f, columns, nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + err = source.Close() + if err != nil { + t.Errorf("Unexpected error on Close: %v", err) + } + if source.reader != nil { + t.Errorf("Expected reader to be nil after Close, but it is not") + } + _, err = source.Next(1) + if err != io.EOF { + t.Error("expected reader to return io.EOF") + } + +} +func TestRunToEnd(t *testing.T) { + f := getTestParquetFile() + columns := []string{"country", "capital", "lat"} + source, err := NewParquetSourcePushDown(f, columns, nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + for { + rc, err := source.Next(1024 * 8) + if err != nil { + if err == io.EOF { + break + } + t.Fatalf("Unexpected error on Next: %v", err) + } + t.Log("RecordBatch: ", rc) + } +} + +func TestParquetRead(t *testing.T) { + f := getTestParquetFile() + columns := []string{"country", "capital", "lat"} + source, err := NewParquetSourcePushDown(f, columns, nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + // batchSize := uint16(10) + rc, err := source.Next(uint16(15)) + if err != nil { + 
t.Fatalf("Unexpected error on Next: %v", err) + } + if rc == nil { + t.Fatalf("Expected RecordBatch, got nil") + } + if len(rc.Columns) != len(columns) { + t.Errorf("Expected %d columns, got %d", len(columns), len(rc.Columns)) + } + if rc.Schema.NumFields() != len(columns) { + t.Errorf("Expected schema to have %d fields, got %d", len(columns), rc.Schema.NumFields()) + } + t.Logf("columns:%v\n", rc.Columns) + t.Logf("count:%d\n", rc.RowCount) +} + +// CombineArray tests: cover primitive, uint, float, bool, string, binary and nil-handling +func TestCombineArray_Cases(t *testing.T) { + mem := memory.NewGoAllocator() + + t.Run("INT8", func(t *testing.T) { + ib1 := array.NewInt8Builder(mem) + ib1.Append(1) + ib1.AppendNull() + a1 := ib1.NewArray().(*array.Int8) + ib2 := array.NewInt8Builder(mem) + ib2.Append(2) + ib2.Append(3) + a2 := ib2.NewArray().(*array.Int8) + comb := CombineArray(a1, a2).(*array.Int8) + if comb.Len() != a1.Len()+a2.Len() { + t.Fatalf("int8 combined length wrong") + } + if comb.Value(0) != 1 || !comb.IsNull(1) || comb.Value(2) != 2 { + t.Fatalf("int8 values unexpected") + } + a1.Release() + a2.Release() + comb.Release() + }) + + t.Run("INT16", func(t *testing.T) { + i16b1 := array.NewInt16Builder(mem) + i16b1.Append(10) + i16b1.Append(20) + ia1 := i16b1.NewArray().(*array.Int16) + i16b2 := array.NewInt16Builder(mem) + i16b2.Append(30) + ia2 := i16b2.NewArray().(*array.Int16) + i16c := CombineArray(ia1, ia2).(*array.Int16) + if i16c.Len() != ia1.Len()+ia2.Len() { + t.Fatalf("int16 combined length") + } + ia1.Release() + ia2.Release() + i16c.Release() + }) + + t.Run("INT32", func(t *testing.T) { + i32b1 := array.NewInt32Builder(mem) + i32b1.Append(1) + ia32_1 := i32b1.NewArray().(*array.Int32) + i32b2 := array.NewInt32Builder(mem) + i32b2.Append(2) + ia32_2 := i32b2.NewArray().(*array.Int32) + i32c := CombineArray(ia32_1, ia32_2).(*array.Int32) + if i32c.Len() != 2 { + t.Fatalf("int32 combined length") + } + ia32_1.Release() + ia32_2.Release() + 
i32c.Release() + }) + + t.Run("INT64", func(t *testing.T) { + i64b1 := array.NewInt64Builder(mem) + i64b1.Append(100) + ia64_1 := i64b1.NewArray().(*array.Int64) + i64b2 := array.NewInt64Builder(mem) + i64b2.Append(200) + ia64_2 := i64b2.NewArray().(*array.Int64) + i64c := CombineArray(ia64_1, ia64_2).(*array.Int64) + if i64c.Len() != 2 { + t.Fatalf("int64 combined length") + } + ia64_1.Release() + ia64_2.Release() + i64c.Release() + }) + + t.Run("UINT8", func(t *testing.T) { + u8b1 := array.NewUint8Builder(mem) + u8b1.Append(8) + ua8_1 := u8b1.NewArray().(*array.Uint8) + u8b2 := array.NewUint8Builder(mem) + u8b2.Append(9) + ua8_2 := u8b2.NewArray().(*array.Uint8) + u8c := CombineArray(ua8_1, ua8_2).(*array.Uint8) + if u8c.Len() != 2 { + t.Fatalf("uint8 combined length") + } + ua8_1.Release() + ua8_2.Release() + u8c.Release() + }) + + t.Run("UINT16", func(t *testing.T) { + u16b1 := array.NewUint16Builder(mem) + u16b1.Append(16) + ua16_1 := u16b1.NewArray().(*array.Uint16) + u16b2 := array.NewUint16Builder(mem) + u16b2.Append(32) + ua16_2 := u16b2.NewArray().(*array.Uint16) + u16c := CombineArray(ua16_1, ua16_2).(*array.Uint16) + if u16c.Len() != 2 { + t.Fatalf("uint16 combined length") + } + ua16_1.Release() + ua16_2.Release() + u16c.Release() + }) + + t.Run("UINT32", func(t *testing.T) { + u32b1 := array.NewUint32Builder(mem) + u32b1.Append(1000) + ua32_1 := u32b1.NewArray().(*array.Uint32) + u32b2 := array.NewUint32Builder(mem) + u32b2.Append(2000) + ua32_2 := u32b2.NewArray().(*array.Uint32) + u32c := CombineArray(ua32_1, ua32_2).(*array.Uint32) + if u32c.Len() != 2 { + t.Fatalf("uint32 combined length") + } + ua32_1.Release() + ua32_2.Release() + u32c.Release() + }) + + t.Run("UINT64", func(t *testing.T) { + u64b1 := array.NewUint64Builder(mem) + u64b1.Append(10000) + ua64_1 := u64b1.NewArray().(*array.Uint64) + u64b2 := array.NewUint64Builder(mem) + u64b2.Append(20000) + ua64_2 := u64b2.NewArray().(*array.Uint64) + u64c := CombineArray(ua64_1, 
ua64_2).(*array.Uint64) + if u64c.Len() != 2 { + t.Fatalf("uint64 combined length") + } + ua64_1.Release() + ua64_2.Release() + u64c.Release() + }) + + t.Run("FLOAT32", func(t *testing.T) { + f32b1 := array.NewFloat32Builder(mem) + f32b1.Append(1.25) + fa32_1 := f32b1.NewArray().(*array.Float32) + f32b2 := array.NewFloat32Builder(mem) + f32b2.Append(2.5) + fa32_2 := f32b2.NewArray().(*array.Float32) + f32c := CombineArray(fa32_1, fa32_2).(*array.Float32) + if f32c.Len() != 2 { + t.Fatalf("float32 combined length") + } + fa32_1.Release() + fa32_2.Release() + f32c.Release() + }) + + t.Run("FLOAT64", func(t *testing.T) { + f64b1 := array.NewFloat64Builder(mem) + f64b1.Append(3.14) + fa64_1 := f64b1.NewArray().(*array.Float64) + f64b2 := array.NewFloat64Builder(mem) + f64b2.Append(6.28) + fa64_2 := f64b2.NewArray().(*array.Float64) + f64c := CombineArray(fa64_1, fa64_2).(*array.Float64) + if f64c.Len() != 2 { + t.Fatalf("float64 combined length") + } + fa64_1.Release() + fa64_2.Release() + f64c.Release() + }) + + t.Run("BOOL", func(t *testing.T) { + bb1 := array.NewBooleanBuilder(mem) + bb1.Append(true) + bb1.AppendNull() + ba1 := bb1.NewArray().(*array.Boolean) + bb2 := array.NewBooleanBuilder(mem) + bb2.Append(false) + ba2 := bb2.NewArray().(*array.Boolean) + bc := CombineArray(ba1, ba2).(*array.Boolean) + if bc.Len() != ba1.Len()+ba2.Len() { + t.Fatalf("bool combined length") + } + ba1.Release() + ba2.Release() + bc.Release() + }) + + t.Run("STRING", func(t *testing.T) { + sb1 := array.NewStringBuilder(mem) + sb1.Append("one") + sb1.AppendNull() + sa1 := sb1.NewArray().(*array.String) + sb2 := array.NewStringBuilder(mem) + sb2.Append("two") + sa2 := sb2.NewArray().(*array.String) + sc := CombineArray(sa1, sa2).(*array.String) + if sc.Len() != sa1.Len()+sa2.Len() { + t.Fatalf("string combined length") + } + sa1.Release() + sa2.Release() + sc.Release() + }) + + t.Run("LARGE_STRING", func(t *testing.T) { + lsb1 := array.NewLargeStringBuilder(mem) + lsb1.Append("big1") 
+ la1 := lsb1.NewArray().(*array.LargeString) + lsb2 := array.NewLargeStringBuilder(mem) + lsb2.Append("big2") + la2 := lsb2.NewArray().(*array.LargeString) + lc := CombineArray(la1, la2).(*array.LargeString) + if lc.Len() != la1.Len()+la2.Len() { + t.Fatalf("large string combined length") + } + la1.Release() + la2.Release() + lc.Release() + }) + + t.Run("BINARY", func(t *testing.T) { + bbld := array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + bbld.Append([]byte("a")) + baBb1 := bbld.NewArray().(*array.Binary) + bbld2 := array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + bbld2.Append([]byte("b")) + baBb2 := bbld2.NewArray().(*array.Binary) + bcbin := CombineArray(baBb1, baBb2).(*array.Binary) + if bcbin.Len() != baBb1.Len()+baBb2.Len() { + t.Fatalf("binary combined length") + } + baBb1.Release() + baBb2.Release() + bcbin.Release() + }) + + t.Run("NIL_A1", func(t *testing.T) { + // build a small binary array to pass as second + bbld := array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + bbld.Append([]byte("z")) + sec := bbld.NewArray().(*array.Binary) + got := CombineArray(nil, sec) + if got == nil { + t.Fatalf("expected non-nil when a1 is nil") + } + if got != sec { // CombineArray will return sec directly when a1 is nil + got.Release() + } + sec.Release() + }) + + t.Run("NIL_A2", func(t *testing.T) { + bbld := array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + bbld.Append([]byte("y")) + first := bbld.NewArray().(*array.Binary) + got := CombineArray(first, nil) + if got == nil { + t.Fatalf("expected non-nil when a2 is nil") + } + if got != first { // CombineArray will return first directly when a2 is nil + got.Release() + } + first.Release() + }) +} + +// includes null values so append* helpers take the AppendNull branch. 
+func TestCombineArray_PerTypeNulls(t *testing.T) { + mem := memory.NewGoAllocator() + + t.Run("AppendUint16_nulls", func(t *testing.T) { + b1 := array.NewUint16Builder(mem) + b1.Append(11) + b1.AppendNull() + b1.Append(13) + a1 := b1.NewArray().(*array.Uint16) + + b2 := array.NewUint16Builder(mem) + b2.AppendNull() + b2.Append(15) + a2 := b2.NewArray().(*array.Uint16) + + out := CombineArray(a1, a2).(*array.Uint16) + if out.Len() != 5 { + t.Fatalf("uint16 expected len 5 got %d", out.Len()) + } + if !out.IsNull(1) || !out.IsNull(3) { + t.Fatalf("uint16 nulls not preserved") + } + a1.Release() + a2.Release() + out.Release() + }) + + t.Run("AppendInt16_nulls", func(t *testing.T) { + b1 := array.NewInt16Builder(mem) + b1.Append(21) + b1.AppendNull() + a1 := b1.NewArray().(*array.Int16) + b2 := array.NewInt16Builder(mem) + b2.AppendNull() + b2.Append(23) + a2 := b2.NewArray().(*array.Int16) + out := CombineArray(a1, a2).(*array.Int16) + if out.Len() != 4 { + t.Fatalf("int16 expected len 4 got %d", out.Len()) + } + if !out.IsNull(1) || !out.IsNull(2) { + t.Fatalf("int16 nulls not present") + } + a1.Release() + a2.Release() + out.Release() + }) + + t.Run("AppendInt32_nulls", func(t *testing.T) { + b1 := array.NewInt32Builder(mem) + b1.Append(31) + b1.AppendNull() + a1 := b1.NewArray().(*array.Int32) + b2 := array.NewInt32Builder(mem) + b2.AppendNull() + b2.Append(33) + a2 := b2.NewArray().(*array.Int32) + out := CombineArray(a1, a2).(*array.Int32) + if !out.IsNull(1) || !out.IsNull(2) { + t.Fatalf("int32 nulls not present") + } + a1.Release() + a2.Release() + out.Release() + }) + t.Run("AppendUint32_nulls", func(t *testing.T) { + b1 := array.NewUint32Builder(mem) + b1.AppendNull() + b1.Append(22) + a1 := b1.NewArray().(*array.Uint32) + b2 := array.NewUint32Builder(mem) + b2.Append(23) + b2.AppendNull() + a2 := b2.NewArray().(*array.Uint32) + out := CombineArray(a1, a2).(*array.Uint32) + if !out.IsNull(0) || !out.IsNull(3) { + t.Fatalf("uint32 nulls not present") + } + 
a1.Release() + a2.Release() + out.Release() + }) + + t.Run("AppendInt64_nulls", func(t *testing.T) { + b1 := array.NewInt64Builder(mem) + b1.AppendNull() + b1.Append(41) + a1 := b1.NewArray().(*array.Int64) + b2 := array.NewInt64Builder(mem) + b2.Append(42) + b2.AppendNull() + a2 := b2.NewArray().(*array.Int64) + out := CombineArray(a1, a2).(*array.Int64) + if !out.IsNull(0) || !out.IsNull(3) { + t.Fatalf("int64 nulls not present") + } + a1.Release() + a2.Release() + out.Release() + }) + + t.Run("AppendUint64_nulls", func(t *testing.T) { + b1 := array.NewUint64Builder(mem) + b1.AppendNull() + b1.Append(41) + a1 := b1.NewArray().(*array.Uint64) + b2 := array.NewUint64Builder(mem) + b2.Append(42) + b2.AppendNull() + a2 := b2.NewArray().(*array.Uint64) + out := CombineArray(a1, a2).(*array.Uint64) + if !out.IsNull(0) || !out.IsNull(3) { + t.Fatalf("Uint64 nulls not present") + } + a1.Release() + a2.Release() + out.Release() + + }) + + t.Run("AppendUint8_nulls", func(t *testing.T) { + b1 := array.NewUint8Builder(mem) + b1.AppendNull() + b1.Append(2) + a1 := b1.NewArray().(*array.Uint8) + b2 := array.NewUint8Builder(mem) + b2.Append(3) + b2.AppendNull() + a2 := b2.NewArray().(*array.Uint8) + out := CombineArray(a1, a2).(*array.Uint8) + if !out.IsNull(0) || !out.IsNull(3) { + t.Fatalf("uint8 nulls not present") + } + a1.Release() + a2.Release() + out.Release() + }) + + t.Run("AppendFloat32_nulls", func(t *testing.T) { + b1 := array.NewFloat32Builder(mem) + b1.Append(1.5) + b1.AppendNull() + a1 := b1.NewArray().(*array.Float32) + b2 := array.NewFloat32Builder(mem) + b2.AppendNull() + b2.Append(2.5) + a2 := b2.NewArray().(*array.Float32) + out := CombineArray(a1, a2).(*array.Float32) + if !out.IsNull(1) || !out.IsNull(2) { + t.Fatalf("float32 nulls not present") + } + a1.Release() + a2.Release() + out.Release() + }) + + t.Run("AppendFloat64_nulls", func(t *testing.T) { + b1 := array.NewFloat64Builder(mem) + b1.AppendNull() + b1.Append(3.14) + a1 := 
b1.NewArray().(*array.Float64) + b2 := array.NewFloat64Builder(mem) + b2.Append(4.14) + b2.AppendNull() + a2 := b2.NewArray().(*array.Float64) + out := CombineArray(a1, a2).(*array.Float64) + if !out.IsNull(0) || !out.IsNull(3) { + t.Fatalf("float64 nulls not present") + } + a1.Release() + a2.Release() + out.Release() + }) + + t.Run("AppendBool_nulls", func(t *testing.T) { + b1 := array.NewBooleanBuilder(mem) + b1.Append(true) + b1.AppendNull() + a1 := b1.NewArray().(*array.Boolean) + b2 := array.NewBooleanBuilder(mem) + b2.AppendNull() + b2.Append(false) + a2 := b2.NewArray().(*array.Boolean) + out := CombineArray(a1, a2).(*array.Boolean) + if !out.IsNull(1) || !out.IsNull(2) { + t.Fatalf("bool nulls not present") + } + a1.Release() + a2.Release() + out.Release() + }) + + t.Run("AppendString_nulls", func(t *testing.T) { + b1 := array.NewStringBuilder(mem) + b1.Append("s1") + b1.AppendNull() + a1 := b1.NewArray().(*array.String) + b2 := array.NewStringBuilder(mem) + b2.AppendNull() + b2.Append("s2") + a2 := b2.NewArray().(*array.String) + out := CombineArray(a1, a2).(*array.String) + if !out.IsNull(1) || !out.IsNull(2) { + t.Fatalf("string nulls not present") + } + a1.Release() + a2.Release() + out.Release() + }) + + t.Run("AppendLargeString_nulls", func(t *testing.T) { + b1 := array.NewLargeStringBuilder(mem) + b1.AppendNull() + b1.Append("L1") + a1 := b1.NewArray().(*array.LargeString) + b2 := array.NewLargeStringBuilder(mem) + b2.Append("L2") + b2.AppendNull() + a2 := b2.NewArray().(*array.LargeString) + out := CombineArray(a1, a2).(*array.LargeString) + if !out.IsNull(0) || !out.IsNull(3) { + t.Fatalf("large string nulls not present") + } + a1.Release() + a2.Release() + out.Release() + }) + + t.Run("AppendBinary_nulls", func(t *testing.T) { + b1 := array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + b1.AppendNull() + b1.Append([]byte("bb1")) + a1 := b1.NewArray().(*array.Binary) + b2 := array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) + 
b2.Append([]byte("bb2")) + b2.AppendNull() + a2 := b2.NewArray().(*array.Binary) + out := CombineArray(a1, a2).(*array.Binary) + if !out.IsNull(0) || !out.IsNull(3) { + t.Fatalf("binary nulls not present") + } + a1.Release() + a2.Release() + out.Release() + }) + +} +func TestCombineArray_UnsupportedType(t *testing.T) { + mem := memory.NewGoAllocator() + + // Build a FixedSizeBinary array (NOT supported in your switch) + dt := &arrow.FixedSizeBinaryType{ByteWidth: 4} + b := array.NewFixedSizeBinaryBuilder(mem, dt) + b.Append([]byte{0, 1, 2, 3}) + b.Append([]byte{4, 5, 6, 7}) + arr := b.NewArray() + b.Release() + + defer arr.Release() + + // Expect panic + defer func() { + if r := recover(); r == nil { + t.Fatalf("expected panic for unsupported datatype") + } + }() + + // Call CombineArray with unsupported type + _ = CombineArray(arr, arr) +} diff --git a/src/Backend/opti-sql-go/operators/project/projectExec.go b/src/Backend/opti-sql-go/operators/project/projectExec.go index 7dac6f1..f633416 100644 --- a/src/Backend/opti-sql-go/operators/project/projectExec.go +++ b/src/Backend/opti-sql-go/operators/project/projectExec.go @@ -1 +1,123 @@ package project + +import ( + "errors" + "io" + "opti-sql-go/operators" + + "github.com/apache/arrow/go/v17/arrow" +) + +var ( + _ = (operators.Operator)(&ProjectExec{}) +) + +var ( + ErrProjectColumnNotFound = errors.New("project: column not found") + ErrEmptyColumnsToProject = errors.New("project: no columns to project") +) + +type ProjectExec struct { + child operators.Operator + outputschema arrow.Schema + columnsToKeep []string + done bool +} + +// columns to keep and existing schema +func NewProjectExec(projectColumns []string, input operators.Operator) (*ProjectExec, error) { + newSchema, err := prunedSchema(input.Schema(), projectColumns) + if err != nil { + return nil, err + } + // return new exec + return &ProjectExec{ + child: input, + outputschema: *newSchema, + columnsToKeep: projectColumns, + }, nil +} + +// pretty 
simple, read from child operator and prune columns,
// releasing whatever is not forwarded; errors (io.EOF included) from the
// child are passed through unchanged.

// Next pulls up to n rows from the child, keeps only the projected columns
// (in projection order) and returns them under the pruned output schema.
// Once a zero-row batch has been observed, subsequent calls return io.EOF.
func (p *ProjectExec) Next(n uint16) (*operators.RecordBatch, error) {
	if p.done {
		return nil, io.EOF
	}

	rc, err := p.child.Next(n)
	if err != nil {
		return nil, err
	}
	_, orderCols, err := ProjectSchemaFilterDown(rc.Schema, rc.Columns, p.columnsToKeep...)
	if err != nil {
		// BUG FIX: the child's columns were previously leaked on this path.
		for _, c := range rc.Columns {
			c.Release()
		}
		return nil, err
	}
	// The kept columns were Retain()ed by ProjectSchemaFilterDown, so the
	// child's references can be dropped now.
	for _, c := range rc.Columns {
		c.Release()
	}
	if rc.RowCount == 0 {
		p.done = true
	}
	return &operators.RecordBatch{
		Schema:   &p.outputschema,
		Columns:  orderCols,
		RowCount: rc.RowCount,
	}, nil
}

// Close releases operator-owned resources; ProjectExec holds none itself
// (the child is closed by whoever owns the plan).
func (p *ProjectExec) Close() error {
	return nil
}

// Schema reports the pruned output schema of this operator.
func (p *ProjectExec) Schema() *arrow.Schema {
	return &p.outputschema
}

// ProjectSchemaFilterDown keeps only keepCols (in the given order) from
// schema/cols, retaining every column it returns. It reports
// ErrEmptyColumnsToProject for an empty projection and
// ErrProjectColumnNotFound when a requested column is missing; on error no
// extra references remain held.
func ProjectSchemaFilterDown(schema *arrow.Schema, cols []arrow.Array, keepCols ...string) (*arrow.Schema, []arrow.Array, error) {
	if len(keepCols) == 0 {
		return arrow.NewSchema([]arrow.Field{}, nil), nil, ErrEmptyColumnsToProject
	}

	// Build map: columnName -> original index (e.g. "age" -> 0).
	fieldIndex := make(map[string]int, schema.NumFields())
	for i, f := range schema.Fields() {
		fieldIndex[f.Name] = i
	}

	newFields := make([]arrow.Field, 0, len(keepCols))
	newCols := make([]arrow.Array, 0, len(keepCols))

	// Preserve order from keepCols, not schema order.
	for _, name := range keepCols {
		idx, exists := fieldIndex[name]
		if !exists {
			// BUG FIX: release the columns already retained in earlier
			// iterations before bailing out, otherwise they leak.
			for _, c := range newCols {
				c.Release()
			}
			return arrow.NewSchema([]arrow.Field{}, nil), []arrow.Array{}, ErrProjectColumnNotFound
		}

		newFields = append(newFields, schema.Field(idx))
		col := cols[idx]
		col.Retain()
		newCols = append(newCols, col)
	}

	newSchema := arrow.NewSchema(newFields, nil)
	return newSchema, newCols, nil
}

// prunedSchema returns a schema containing only keepCols, in keepCols order.
func prunedSchema(schema *arrow.Schema, keepCols []string) (*arrow.Schema, error) {
	if len(keepCols) == 0 {
		return arrow.NewSchema([]arrow.Field{}, 
nil), ErrEmptyColumnsToProject + } + newFields := make([]arrow.Field, 0) + for _, colName := range keepCols { + idx := schema.FieldIndices(colName) + if len(idx) == 0 { + return nil, ErrProjectColumnNotFound + } + // append the field + newFields = append(newFields, schema.Field(idx[0])) + } + newSchema := arrow.NewSchema(newFields, nil) + return newSchema, nil +} diff --git a/src/Backend/opti-sql-go/operators/project/projectExec_test.go b/src/Backend/opti-sql-go/operators/project/projectExec_test.go index 404af84..04a0ecd 100644 --- a/src/Backend/opti-sql-go/operators/project/projectExec_test.go +++ b/src/Backend/opti-sql-go/operators/project/projectExec_test.go @@ -1,7 +1,275 @@ package project -import "testing" +import ( + "errors" + "io" + "testing" -func TestProjectExec(t *testing.T) { + "github.com/apache/arrow/go/v17/arrow" +) + +func TestProjectExecInit(t *testing.T) { // Simple passing test } + +func TestProjectPrune(t *testing.T) { + fields := []arrow.Field{ + {Name: "id", Type: arrow.PrimitiveTypes.Int64}, + {Name: "name", Type: arrow.BinaryTypes.String}, + {Name: "age", Type: arrow.PrimitiveTypes.Int64}, + {Name: "country", Type: arrow.BinaryTypes.String}, + {Name: "email", Type: arrow.BinaryTypes.String}, + {Name: "signup_date", Type: arrow.FixedWidthTypes.Date32}, + } + schema := arrow.NewSchema(fields, nil) + t.Run("validate prune 1", func(t *testing.T) { + keepCols := []string{"id", "name", "email"} + newSchema, err := prunedSchema(schema, keepCols) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if newSchema.NumFields() != len(keepCols) { + t.Fatalf("expected %d fields, got %d", len(keepCols), newSchema.NumFields()) + } + for i, field := range newSchema.Fields() { + if field.Name != keepCols[i] { + t.Fatalf("expected field %s, got %s", keepCols[i], field.Name) + } + } + t.Logf("%s\n", newSchema) + }) + t.Run("validate prune 2", func(t *testing.T) { + keeptCols := []string{"age", "country", "signup_date"} + newSchema, err := 
prunedSchema(schema, keeptCols) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if newSchema.NumFields() != len(keeptCols) { + t.Fatalf("expected %d fields, got %d", len(keeptCols), newSchema.NumFields()) + } + for i, field := range newSchema.Fields() { + if field.Name != keeptCols[i] { + t.Fatalf("expected field %s, got %s", keeptCols[i], field.Name) + } + } + t.Logf("%s\n", newSchema) + + }) + t.Run("prune non-existant column", func(t *testing.T) { + keepCols := []string{"id", "non_existing_column"} + _, err := prunedSchema(schema, keepCols) + if err == nil { + t.Fatalf("expected error for non-existing column, got nil") + } + if !errors.Is(err, ErrProjectColumnNotFound) { + t.Fatalf("expected ErrProjectColumnNotFound, got %v", err) + } + + }) + t.Run("Prune empty input keepcols", func(t *testing.T) { + keepCols := []string{} + _, err := prunedSchema(schema, keepCols) + if err == nil { + t.Fatalf("expected error for empty keepcols, got nil") + } + if !errors.Is(err, ErrEmptyColumnsToProject) { + t.Fatalf("expected ErrEmptyColumnsToProject, got %v", err) + } + }) + +} +func TestProjectExec(t *testing.T) { + names, col := generateTestColumns() + memorySource, err := NewInMemoryProjectExec(names, col) + if err != nil { + t.Fatalf("failed to create in memory source: %v", err) + } + t.Logf("original schema %v\n", memorySource.Schema()) + projectExec, err := NewProjectExec([]string{"id", "name", "age"}, memorySource) + if err != nil { + t.Fatalf("failed to create project exec: %v", err) + } + rc, err := projectExec.Next(3) + if err != nil { + t.Fatalf("failed to get next record batch: %v", err) + } + t.Logf("rc:%v\n", rc) + +} + +// NewProjectExec, pruned schema errors and iteration behavior. 
+func TestProjectExec_Subtests(t *testing.T) { + names, cols := generateTestColumns() + + t.Run("ValidProjection", func(t *testing.T) { + memSrc, err := NewInMemoryProjectExec(names, cols) + if err != nil { + t.Fatalf("failed to create in memory source: %v", err) + } + projCols := []string{"id", "name", "age"} + projExec, err := NewProjectExec(projCols, memSrc) + if err != nil { + t.Fatalf("failed to create project exec: %v", err) + } + rb, err := projExec.Next(4) + if err != nil { + t.Fatalf("Next failed: %v", err) + } + if rb == nil { + t.Fatalf("expected a record batch, got nil") + } + if len(rb.Columns) != len(projCols) { + t.Fatalf("expected %d columns, got %d", len(projCols), len(rb.Columns)) + } + for _, c := range rb.Columns { + c.Release() + } + }) + + t.Run("EmptyColumns", func(t *testing.T) { + memSrc, err := NewInMemoryProjectExec(names, cols) + if err != nil { + t.Fatalf("failed to create in memory source: %v", err) + } + _, err = NewProjectExec([]string{}, memSrc) + if err == nil { + t.Fatalf("expected error for empty project columns, got nil") + } + if !errors.Is(err, ErrEmptyColumnsToProject) { + t.Fatalf("expected ErrEmptyColumnsToProject, got %v", err) + } + }) + + t.Run("NonExistentColumn", func(t *testing.T) { + memSrc, err := NewInMemoryProjectExec(names, cols) + if err != nil { + t.Fatalf("failed to create in memory source: %v", err) + } + _, err = NewProjectExec([]string{"id", "nope"}, memSrc) + if err == nil { + t.Fatalf("expected error for non-existent column, got nil") + } + if !errors.Is(err, ErrProjectColumnNotFound) { + t.Fatalf("expected ErrProjectColumnNotFound, got %v", err) + } + }) + + t.Run("SchemaMatch", func(t *testing.T) { + memSrc, err := NewInMemoryProjectExec(names, cols) + if err != nil { + t.Fatalf("failed to create in memory source: %v", err) + } + projCols := []string{"id", "name"} + projExec, err := NewProjectExec(projCols, memSrc) + if err != nil { + t.Fatalf("failed to create project exec: %v", err) + } + execSchema 
:= projExec.Schema() + pruned, err := prunedSchema(memSrc.Schema(), projCols) + if err != nil { + t.Fatalf("prunedSchema failed: %v", err) + } + if !execSchema.Equal(pruned) { + t.Fatalf("expected exec schema %v, got %v", pruned, execSchema) + } + _ = projExec + }) + + t.Run("IterateUntilEOF", func(t *testing.T) { + memSrc, err := NewInMemoryProjectExec(names, cols) + if err != nil { + t.Fatalf("failed to create in memory source: %v", err) + } + projExec, err := NewProjectExec([]string{"id", "name"}, memSrc) + if err != nil { + t.Fatalf("failed to create project exec: %v", err) + } + total := 0 + batches := 0 + for { + rb, err := projExec.Next(3) + if err != nil { + if errors.Is(err, io.EOF) { + break + } + t.Fatalf("Next returned unexpected error: %v", err) + } + if rb == nil { + t.Fatalf("expected record batch, got nil") + } + total += int(rb.Columns[0].Len()) + batches++ + for _, c := range rb.Columns { + c.Release() + } + } + if batches == 0 { + t.Fatalf("expected at least 1 batch, got 0") + } + }) + + t.Run("SingleColumnProjection", func(t *testing.T) { + memSrc, err := NewInMemoryProjectExec(names, cols) + if err != nil { + t.Fatalf("failed to create in memory source: %v", err) + } + projExec, err := NewProjectExec([]string{"department"}, memSrc) + if err != nil { + t.Fatalf("failed to create project exec: %v", err) + } + total := 0 + for { + rb, err := projExec.Next(5) + if err != nil { + if errors.Is(err, io.EOF) { + break + } + t.Fatalf("Next returned unexpected error: %v", err) + } + if len(rb.Columns) != 1 { + t.Fatalf("expected 1 column, got %d", len(rb.Columns)) + } + total += int(rb.Columns[0].Len()) + for _, c := range rb.Columns { + c.Release() + } + } + }) + t.Run("Check Close", func(t *testing.T) { + memSrc, err := NewInMemoryProjectExec(names, cols) + if err != nil { + t.Fatalf("failed to create in memory source: %v", err) + } + projExec, err := NewProjectExec([]string{"department"}, memSrc) + if err != nil { + t.Fatalf("failed to create project 
exec: %v", err) + } + err = projExec.Close() + if err != nil { + t.Fatalf("expected no error on Close, got %v", err) + } + + }) + t.Run("Empty ProjectFilter", func(t *testing.T) { + memSrc, err := NewInMemoryProjectExec(names, cols) + if err != nil { + t.Fatalf("failed to create in memory source: %v", err) + } + _, _, err = ProjectSchemaFilterDown(memSrc.Schema(), memSrc.columns, []string{}...) + if err == nil { + t.Fatalf("expected error for empty project filter, got nil") + } + if !errors.Is(err, ErrEmptyColumnsToProject) { + t.Fatalf("expected ErrEmptyColumnsToProject, got %v", err) + } + _, _, err = ProjectSchemaFilterDown(memSrc.Schema(), memSrc.columns, []string{"This column doesnt exist"}...) + if err == nil { + t.Fatalf("expected error for non-existent column in project filter, got nil") + } + if !errors.Is(err, ErrProjectColumnNotFound) { + t.Fatalf("expected ErrProjectColumnNotFound, got %v", err) + } + + }) + +} diff --git a/src/Backend/opti-sql-go/operators/project/s3.go b/src/Backend/opti-sql-go/operators/project/s3.go new file mode 100644 index 0000000..b418503 --- /dev/null +++ b/src/Backend/opti-sql-go/operators/project/s3.go @@ -0,0 +1,105 @@ +package project + +import ( + "fmt" + "io" + "opti-sql-go/config" + "os" + "time" + + "github.com/minio/minio-go" +) + +var secretes = config.GetConfig().Secretes + +type NetworkResource struct { + client *minio.Client + bucket string + key string + + // raw streaming object for CSV + stream *minio.Object + // for clean up-testing + fileName string +} + +func NewStreamReader(fileName string) (*NetworkResource, error) { + accessKey := secretes.AccessKey + secretKey := secretes.SecretKey + endpoint := secretes.EndpointURL + bucket := secretes.BucketName + useSSL := true + + client, err := minio.New(endpoint, accessKey, secretKey, useSSL) + if err != nil { + return nil, err + } + + obj, err := client.GetObject(bucket, fileName, minio.GetObjectOptions{}) + if err != nil { + return nil, err + } + + return 
&NetworkResource{ + client: client, + bucket: bucket, + key: fileName, + fileName: fileName, + stream: obj, // CSV reads this directly + }, nil +} + +func (n *NetworkResource) Stream() io.Reader { + return n.stream +} + +// S3ReaderAt implements io.ReaderAt for Parquet readers +func (n *NetworkResource) ReadAt(p []byte, off int64) (int, error) { + opts := minio.GetObjectOptions{} + _ = opts.SetRange(off, off+int64(len(p))-1) + + obj, err := n.client.GetObject(n.bucket, n.key, opts) + if err != nil { + return 0, err + } + return io.ReadFull(obj, p) +} + +func (n *NetworkResource) Seek(offset int64, whence int) (int64, error) { + switch whence { + case io.SeekStart: + return offset, nil + case io.SeekEnd: + // Need to return total object size + info, err := n.client.StatObject(n.bucket, n.key, minio.StatObjectOptions{}) + if err != nil { + return 0, fmt.Errorf("failed to stat object: %w", err) + } + return info.Size, nil + default: + return 0, fmt.Errorf("unsupported seek mode for S3: %d", whence) + } +} +func (n *NetworkResource) DownloadLocally() (*os.File, error) { + f, err := os.Create(fmt.Sprintf("%s-%d", n.key, time.Now().UnixNano())) + if err != nil { + return nil, err + } + + // Read entire stream once + content, err := io.ReadAll(n.stream) + if err != nil { + return nil, err + } + + if _, err := f.Write(content); err != nil { + return nil, err + } + + // Rewind so CSV readers can start from beginning + if _, err := f.Seek(0, io.SeekStart); err != nil { + return nil, err + } + + return f, nil +} diff --git a/src/Backend/opti-sql-go/operators/project/source/csv.go b/src/Backend/opti-sql-go/operators/project/source/csv.go deleted file mode 100644 index d150341..0000000 --- a/src/Backend/opti-sql-go/operators/project/source/csv.go +++ /dev/null @@ -1 +0,0 @@ -package source diff --git a/src/Backend/opti-sql-go/operators/project/source/csv_test.go b/src/Backend/opti-sql-go/operators/project/source/csv_test.go deleted file mode 100644 index c00d2dd..0000000 --- 
a/src/Backend/opti-sql-go/operators/project/source/csv_test.go +++ /dev/null @@ -1,7 +0,0 @@ -package source - -import "testing" - -func TestCsv(t *testing.T) { - // Simple passing test -} diff --git a/src/Backend/opti-sql-go/operators/project/source/custom.go b/src/Backend/opti-sql-go/operators/project/source/custom.go deleted file mode 100644 index 9a323c3..0000000 --- a/src/Backend/opti-sql-go/operators/project/source/custom.go +++ /dev/null @@ -1,4 +0,0 @@ -package source - -// in memory format just for the ease of testing -// same as other sources, we can use structs/slices here diff --git a/src/Backend/opti-sql-go/operators/project/source/custom_test.go b/src/Backend/opti-sql-go/operators/project/source/custom_test.go deleted file mode 100644 index 53df34f..0000000 --- a/src/Backend/opti-sql-go/operators/project/source/custom_test.go +++ /dev/null @@ -1,7 +0,0 @@ -package source - -import "testing" - -func TestCustom(t *testing.T) { - // Simple passing test -} diff --git a/src/Backend/opti-sql-go/operators/project/source/json.go b/src/Backend/opti-sql-go/operators/project/source/json.go deleted file mode 100644 index d150341..0000000 --- a/src/Backend/opti-sql-go/operators/project/source/json.go +++ /dev/null @@ -1 +0,0 @@ -package source diff --git a/src/Backend/opti-sql-go/operators/project/source/json_test.go b/src/Backend/opti-sql-go/operators/project/source/json_test.go deleted file mode 100644 index 482bb43..0000000 --- a/src/Backend/opti-sql-go/operators/project/source/json_test.go +++ /dev/null @@ -1,7 +0,0 @@ -package source - -import "testing" - -func TestJson(t *testing.T) { - // Simple passing test -} diff --git a/src/Backend/opti-sql-go/operators/project/source/parquet.go b/src/Backend/opti-sql-go/operators/project/source/parquet.go deleted file mode 100644 index d150341..0000000 --- a/src/Backend/opti-sql-go/operators/project/source/parquet.go +++ /dev/null @@ -1 +0,0 @@ -package source diff --git 
a/src/Backend/opti-sql-go/operators/project/source/parquet_test.go b/src/Backend/opti-sql-go/operators/project/source/parquet_test.go deleted file mode 100644 index f677b80..0000000 --- a/src/Backend/opti-sql-go/operators/project/source/parquet_test.go +++ /dev/null @@ -1,7 +0,0 @@ -package source - -import "testing" - -func TestParquet(t *testing.T) { - // Simple passing test -} diff --git a/src/Backend/opti-sql-go/operators/project/source/s3.go b/src/Backend/opti-sql-go/operators/project/source/s3.go deleted file mode 100644 index d150341..0000000 --- a/src/Backend/opti-sql-go/operators/project/source/s3.go +++ /dev/null @@ -1 +0,0 @@ -package source diff --git a/src/Backend/opti-sql-go/operators/project/source/s3_test.go b/src/Backend/opti-sql-go/operators/project/source/s3_test.go deleted file mode 100644 index f62698f..0000000 --- a/src/Backend/opti-sql-go/operators/project/source/s3_test.go +++ /dev/null @@ -1,7 +0,0 @@ -package source - -import "testing" - -func TestS3(t *testing.T) { - // Simple passing test -} diff --git a/src/Backend/opti-sql-go/operators/project/source_test.go b/src/Backend/opti-sql-go/operators/project/source_test.go new file mode 100644 index 0000000..facce88 --- /dev/null +++ b/src/Backend/opti-sql-go/operators/project/source_test.go @@ -0,0 +1,304 @@ +package project + +import ( + "io" + "os" + "strings" + "testing" +) + +const ( + s3CSVFile = "country_full.csv" + s3ParquetFile = "userdata.parquet" + s3TxtFile = "example.txt" +) + +// test s3 as a source first then run test for other source files here +func TestS3(t *testing.T) { + // Simple passing test + _, err := NewStreamReader(s3CSVFile) + if err != nil { + t.Fatalf("failed to create s3 stream reader: %v", err) + } +} + +// test for +// (1) reading files from network (s3) should provide exact same abstraction as a local file +func TestS3BasicRead(t *testing.T) { + t.Run("csv read", func(t *testing.T) { + nr, err := NewStreamReader(s3CSVFile) + if err != nil { + t.Fatalf("failed 
to create s3 object: %v", err) + } + firstKB := make([]byte, 1024) + n, err := nr.stream.Read(firstKB) + if err != nil { + t.Fatalf("failed to read from s3 object: %v", err) + } + if n != 1024 { + t.Fatalf("expected to read 1024 bytes, but read %d bytes", n) + } + t.Logf("returned contents %s\n", firstKB[:n]) + + }) + t.Run("parquet read", func(t *testing.T) { + nr, err := NewStreamReader(s3ParquetFile) + if err != nil { + t.Fatalf("failed to create s3 object: %v", err) + } + firstKB := make([]byte, 1024) + n, err := nr.stream.Read(firstKB) + if err != nil { + t.Fatalf("failed to read from s3 object: %v", err) + } + if n != 1024 { + t.Fatalf("expected to read 1024 bytes, but read %d bytes", n) + } + t.Logf("returned contents %v\n", firstKB[:n]) + + }) + t.Run("txt read", func(t *testing.T) { + nr, err := NewStreamReader(s3TxtFile) + if err != nil { + t.Fatalf("failed to create s3 object: %v", err) + } + firstKB := make([]byte, 1024) + n, err := nr.stream.Read(firstKB) + if err != nil { + t.Fatalf("failed to read from s3 object: %v", err) + } + if n != 1024 { + t.Fatalf("expected to read 1024 bytes, but read %d bytes", n) + } + t.Logf("returned contents %s\n", firstKB[:n]) + + }) +} + +// (2) download entire file before reading +func TestS3Download(t *testing.T) { + t.Run("Download CSV locally", func(t *testing.T) { + nr, err := NewStreamReader(s3CSVFile) + if err != nil { + t.Fatalf("failed to create s3 object: %v", err) + } + newFile, err := nr.DownloadLocally() + if err != nil { + t.Fatalf("failed to download file locally %v", err) + } + defer func() { + _ = newFile.Close() + if err := os.Remove(newFile.Name()); err != nil { + t.Fatalf("error closing file %v", newFile.Name()) + } + }() + // validate stats about file + info, err := newFile.Stat() + if err != nil { + t.Fatalf("failed to get file stats %v", err) + } + if info.IsDir() { + t.Fatalf("expected regular file, found directory: %s", info.Name()) + } + + if info.Size() < 100 { + t.Fatalf("file is too small 
(%d bytes), expected >= 100 bytes", info.Size()) + } + + if !strings.HasPrefix(info.Name(), nr.fileName) { + t.Fatalf("filename mismatch: got %s, expected prefix %s", info.Name(), nr.fileName) + } + + }) + t.Run("Download parquet locally", func(t *testing.T) { + nr, err := NewStreamReader(s3ParquetFile) + if err != nil { + t.Fatalf("failed to create s3 object: %v", err) + } + newFile, err := nr.DownloadLocally() + if err != nil { + t.Fatalf("failed to download file locally %v", err) + } + defer func() { + _ = newFile.Close() + if err := os.Remove(newFile.Name()); err != nil { + t.Fatalf("error closing file %v", newFile.Name()) + } + }() + // validate stats about file + info, err := newFile.Stat() + if err != nil { + t.Fatalf("failed to get file stats %v", err) + } + if info.IsDir() { + t.Fatalf("expected regular file, found directory: %s", info.Name()) + } + + if info.Size() < 100 { + t.Fatalf("file is too small (%d bytes), expected >= 100 bytes", info.Size()) + } + + if !strings.HasPrefix(info.Name(), nr.fileName) { + t.Fatalf("filename mismatch: got %s, expected prefix %s", info.Name(), nr.fileName) + } + + }) + t.Run("Download txt locally", func(t *testing.T) { + nr, err := NewStreamReader(s3TxtFile) + if err != nil { + t.Fatalf("failed to create s3 object: %v", err) + } + newFile, err := nr.DownloadLocally() + if err != nil { + t.Fatalf("failed to download file locally %v", err) + } + defer func() { + _ = newFile.Close() + if err := os.Remove(newFile.Name()); err != nil { + t.Fatalf("error closing file %v", newFile.Name()) + } + }() + // validate stats about file + info, err := newFile.Stat() + if err != nil { + t.Fatalf("failed to get file stats %v", err) + } + if info.IsDir() { + t.Fatalf("expected regular file, found directory: %s", info.Name()) + } + + if info.Size() < 100 { + t.Fatalf("file is too small (%d bytes), expected >= 100 bytes", info.Size()) + } + + if !strings.HasPrefix(info.Name(), nr.fileName) { + t.Fatalf("filename mismatch: got %s, expected 
prefix %s", info.Name(), nr.fileName) + } + + }) +} + +// (3) add s3 variant of existing operaor sources (csv,parquet) and write minimal test here to check they work the same +func TestS3ForSource(t *testing.T) { + t.Run("csv from s3 source", func(t *testing.T) { + nr, err := NewStreamReader(s3CSVFile) + if err != nil { + t.Fatalf("failed to create s3 object: %v", err) + } + pj, err := NewProjectCSVLeaf(nr.stream) + if err != nil { + t.Fatalf("failed to create csv project source from s3 object: %v", err) + } + rc, err := pj.Next(5) + if err != nil { + t.Fatalf("failed to read record batch from s3 csv source: %v", err) + } + t.Logf("returned record batch from s3 csv source: %v\n", rc) + + }) + t.Run("parquet from s3 source", func(t *testing.T) { + nr, err := NewStreamReader(s3ParquetFile) + if err != nil { + t.Fatalf("failed to create s3 object: %v", err) + } + pj, err := NewParquetSource(nr) + if err != nil { + t.Fatalf("failed to create parquet project source from s3 object: %v", err) + } + rc, err := pj.Next(5) + if err != nil { + t.Fatalf("failed to read record batch from s3 csv source: %v", err) + } + t.Logf("returned record batch from s3 csv source: %v\n", rc) + + }) + t.Run("csv from s3 source then downloaded", func(t *testing.T) { + nr, err := NewStreamReader(s3CSVFile) + if err != nil { + t.Fatalf("failed to create s3 object: %v", err) + } + f, err := nr.DownloadLocally() + if err != nil { + t.Fatalf("failed to download s3 object locally: %v", err) + } + defer func() { + t.Log("deleting downloaded file...") + _ = f.Close() + if err := os.Remove(f.Name()); err != nil { + t.Fatalf("error closing file %v", f.Name()) + } + }() + pj, err := NewProjectCSVLeaf(f) + if err != nil { + t.Fatalf("failed to create csv project source from s3 object: %v", err) + } + rc, err := pj.Next(5) + if err != nil { + t.Fatalf("failed to read record batch from s3 csv source: %v", err) + } + err = pj.Close() + if err != nil { + t.Fatalf("failed to close csv project source: %v", err) 
+ } + t.Logf("returned record batch from s3 csv source: %v\n", rc) + + }) + t.Run("parquet from s3 source then downloaded", func(t *testing.T) { + nr, err := NewStreamReader(s3ParquetFile) + if err != nil { + t.Fatalf("failed to create s3 object: %v", err) + } + f, err := nr.DownloadLocally() + if err != nil { + t.Fatalf("failed to download s3 object locally: %v", err) + } + defer func() { + t.Log("deleting downloaded file...") + _ = f.Close() + if err := os.Remove(f.Name()); err != nil { + t.Fatalf("error closing file %v", f.Name()) + } + }() + pj, err := NewParquetSource(f) + if err != nil { + t.Fatalf("failed to create csv project source from s3 object: %v", err) + } + rc, err := pj.Next(5) + if err != nil { + t.Fatalf("failed to read record batch from s3 csv source: %v", err) + } + t.Logf("returned record batch from s3 csv source: %v\n", rc) + + }) +} + +func TestS3Source(t *testing.T) { + nr, err := NewStreamReader(s3CSVFile) + if err != nil { + t.Fatalf("failed to create s3 object: %v", err) + } + t.Run("test SeekStart", func(t *testing.T) { + _, err := nr.Seek(0, io.SeekStart) + if err != nil { + t.Fatalf("failed to seek to start of s3 object: %v", err) + } + }) + t.Run("invalidSeek ", func(t *testing.T) { + _, err := nr.Seek(4, 4) + if err == nil { + t.Fatalf("expected error when seeking with invalid whence, but got none") + } + }) + t.Run("test stream read", func(t *testing.T) { + stream := nr.Stream() + buf := make([]byte, 512) + n, err := stream.Read(buf) + if err != nil { + t.Fatalf("failed to read from s3 object stream: %v", err) + } + if n == 0 { + t.Fatalf("expected to read some bytes from s3 object stream, but read 0 bytes") + } + t.Logf("read %d bytes from s3 object stream: %s\n", n, string(buf[:n])) + }) +} diff --git a/src/Backend/opti-sql-go/operators/record.go b/src/Backend/opti-sql-go/operators/record.go index 70a8a2e..ba2c37d 100644 --- a/src/Backend/opti-sql-go/operators/record.go +++ b/src/Backend/opti-sql-go/operators/record.go @@ -15,9 
+15,16 @@ var ( } ) +type Operator interface { + Next(uint16) (*RecordBatch, error) + Schema() *arrow.Schema + // Call Operator.Close() after Next returns an io.EOF to clean up resources + Close() error +} type RecordBatch struct { - Schema *arrow.Schema - Columns []arrow.Array + Schema *arrow.Schema + Columns []arrow.Array + RowCount uint64 // TODO: update to actually use this, in all operators } type SchemaBuilder struct { diff --git a/src/Backend/opti-sql-go/operators/serialize_test.go b/src/Backend/opti-sql-go/operators/serialize_test.go index b8b3cac..f7d19e3 100644 --- a/src/Backend/opti-sql-go/operators/serialize_test.go +++ b/src/Backend/opti-sql-go/operators/serialize_test.go @@ -136,7 +136,7 @@ func TestSerializerInit(t *testing.T) { // TestSchemaOnlySerialization tests standalone schema serialization/deserialization func TestSchemaOnlySerialization(t *testing.T) { recordBatch := generateDummyRecordBatch1() - fmt.Printf("original schema before serialization: %v\n", recordBatch.Schema) + t.Logf("original schema before serialization: %v\n", recordBatch.Schema) ss, err := NewSerializer(recordBatch.Schema) if err != nil { @@ -148,7 +148,7 @@ func TestSchemaOnlySerialization(t *testing.T) { if err != nil { t.Fatalf("Schema serialization failed: %v", err) } - fmt.Printf("serialized schema bytes length: %d\n", len(serializedSchema)) + t.Logf("serialized schema bytes length: %d\n", len(serializedSchema)) // Deserialize schema deserializedSchema, err := ss.schemaFromDisk(bytes.NewBuffer(serializedSchema)) @@ -160,7 +160,7 @@ func TestSchemaOnlySerialization(t *testing.T) { if !deserializedSchema.Equal(recordBatch.Schema) { t.Fatal("Deserialized schema does not match the original schema") } - fmt.Printf("schema after serialization & deserialization: %v\n", deserializedSchema) + t.Logf("schema after serialization & deserialization: %v\n", deserializedSchema) // Validate field properties for i := 0; i < recordBatch.Schema.NumFields(); i++ { @@ -745,7 +745,7 @@ func 
TestSerializationWithDifferentTypes(t *testing.T) { func TestNullSchemaSerialize(t *testing.T) { rb := generateNullableRecordBatch() for i := range rb.Schema.Fields() { - fmt.Printf("is nullable? : %v\n", rb.Schema.Field(i).Nullable) + t.Logf("is nullable? : %v\n", rb.Schema.Field(i).Nullable) } serializer, err := NewSerializer(rb.Schema) if err != nil { diff --git a/src/Backend/test_data/csv/Mental_Health_and_Social_Media_Balance_Dataset.csv b/src/Backend/test_data/csv/Mental_Health_and_Social_Media_Balance_Dataset.csv new file mode 100644 index 0000000..7457ea2 --- /dev/null +++ b/src/Backend/test_data/csv/Mental_Health_and_Social_Media_Balance_Dataset.csv @@ -0,0 +1,501 @@ +User_ID,Age,Gender,Daily_Screen_Time(hrs),Sleep_Quality(1-10),Stress_Level(1-10),Days_Without_Social_Media,Exercise_Frequency(week),Social_Media_Platform,Happiness_Index(1-10) +U001,44,Male,3.1,7.0,6.0,2.0,5.0,Facebook,10.0 +U002,30,Other,5.1,7.0,8.0,5.0,3.0,LinkedIn,10.0 +U003,23,Other,7.4,6.0,7.0,1.0,3.0,YouTube,6.0 +U004,36,Female,5.7,7.0,8.0,1.0,1.0,TikTok,8.0 +U005,34,Female,7.0,4.0,7.0,5.0,1.0,X (Twitter),8.0 +U006,38,Male,6.6,5.0,7.0,4.0,3.0,LinkedIn,8.0 +U007,26,Female,7.8,4.0,8.0,2.0,0.0,TikTok,7.0 +U008,26,Female,7.4,5.0,6.0,1.0,4.0,Instagram,7.0 +U009,39,Male,4.7,7.0,7.0,6.0,1.0,YouTube,9.0 +U010,39,Female,6.6,6.0,8.0,0.0,2.0,Facebook,7.0 +U011,18,Female,2.8,7.0,6.0,2.0,0.0,Instagram,7.0 +U012,37,Other,5.4,5.0,7.0,3.0,2.0,Instagram,9.0 +U013,17,Female,7.0,7.0,10.0,7.0,1.0,YouTube,8.0 +U014,39,Female,5.7,5.0,7.0,4.0,0.0,Facebook,8.0 +U015,45,Male,6.3,7.0,7.0,4.0,3.0,X (Twitter),9.0 +U016,17,Female,5.1,7.0,6.0,2.0,5.0,LinkedIn,10.0 +U017,36,Female,7.5,5.0,8.0,4.0,4.0,Facebook,7.0 +U018,48,Male,5.4,6.0,4.0,3.0,4.0,TikTok,10.0 +U019,27,Male,4.7,6.0,6.0,0.0,2.0,Instagram,9.0 +U020,37,Male,7.1,6.0,6.0,5.0,4.0,TikTok,10.0 +U021,40,Female,3.0,8.0,5.0,5.0,4.0,X (Twitter),10.0 +U022,42,Other,6.4,7.0,6.0,5.0,3.0,TikTok,10.0 +U023,43,Male,3.6,8.0,5.0,3.0,1.0,Facebook,9.0 
+U024,31,Female,7.8,5.0,8.0,1.0,2.0,TikTok,7.0 +U025,30,Male,5.0,6.0,6.0,0.0,5.0,X (Twitter),8.0 +U026,18,Other,5.7,6.0,7.0,3.0,2.0,X (Twitter),8.0 +U027,22,Female,6.2,5.0,8.0,5.0,0.0,X (Twitter),6.0 +U028,36,Female,2.0,8.0,4.0,5.0,0.0,Facebook,10.0 +U029,24,Male,4.3,6.0,6.0,5.0,2.0,X (Twitter),10.0 +U030,33,Male,7.4,6.0,10.0,3.0,4.0,Instagram,8.0 +U031,19,Male,7.1,5.0,8.0,5.0,2.0,X (Twitter),9.0 +U032,40,Male,6.0,6.0,7.0,5.0,4.0,TikTok,7.0 +U033,29,Female,3.1,8.0,4.0,2.0,2.0,LinkedIn,10.0 +U034,24,Female,6.6,6.0,7.0,4.0,0.0,TikTok,7.0 +U035,41,Other,6.8,8.0,7.0,2.0,2.0,LinkedIn,9.0 +U036,17,Other,5.6,6.0,7.0,4.0,3.0,TikTok,7.0 +U037,35,Female,7.7,4.0,8.0,0.0,3.0,Facebook,8.0 +U038,43,Female,4.3,8.0,4.0,1.0,1.0,TikTok,10.0 +U039,22,Other,4.6,4.0,6.0,5.0,2.0,Facebook,9.0 +U040,23,Female,1.0,9.0,5.0,5.0,7.0,Facebook,10.0 +U041,29,Male,7.2,5.0,7.0,4.0,1.0,X (Twitter),10.0 +U042,32,Male,6.1,5.0,6.0,3.0,3.0,Instagram,7.0 +U043,19,Male,2.5,9.0,3.0,0.0,0.0,LinkedIn,10.0 +U044,17,Other,2.3,9.0,3.0,3.0,4.0,X (Twitter),10.0 +U045,21,Female,3.9,6.0,6.0,4.0,1.0,X (Twitter),8.0 +U046,19,Male,6.9,5.0,7.0,2.0,3.0,Facebook,9.0 +U047,44,Female,5.0,4.0,7.0,2.0,1.0,Facebook,8.0 +U048,33,Female,4.1,8.0,7.0,6.0,3.0,LinkedIn,9.0 +U049,41,Male,3.7,8.0,6.0,3.0,1.0,TikTok,9.0 +U050,49,Male,3.6,7.0,5.0,4.0,1.0,TikTok,9.0 +U051,25,Female,2.0,9.0,4.0,5.0,3.0,X (Twitter),10.0 +U052,29,Female,3.3,7.0,5.0,1.0,2.0,YouTube,10.0 +U053,46,Female,3.6,7.0,5.0,3.0,3.0,YouTube,10.0 +U054,30,Female,6.7,6.0,8.0,6.0,2.0,TikTok,7.0 +U055,23,Female,5.2,6.0,6.0,7.0,3.0,YouTube,8.0 +U056,29,Female,4.0,7.0,3.0,3.0,3.0,TikTok,10.0 +U057,38,Female,9.7,3.0,9.0,3.0,2.0,Facebook,4.0 +U058,36,Female,4.3,7.0,6.0,4.0,1.0,X (Twitter),9.0 +U059,31,Male,7.6,5.0,10.0,3.0,2.0,Instagram,6.0 +U060,33,Male,8.4,4.0,8.0,6.0,4.0,X (Twitter),7.0 +U061,39,Male,2.7,9.0,5.0,1.0,3.0,TikTok,10.0 +U062,41,Female,5.6,6.0,7.0,2.0,1.0,TikTok,9.0 +U063,40,Female,4.6,7.0,7.0,1.0,3.0,X (Twitter),10.0 
+U064,44,Female,4.4,6.0,7.0,1.0,3.0,LinkedIn,9.0 +U065,30,Male,6.5,4.0,9.0,4.0,3.0,TikTok,5.0 +U066,16,Male,4.4,8.0,4.0,5.0,1.0,YouTube,10.0 +U067,40,Male,4.6,7.0,6.0,2.0,1.0,YouTube,10.0 +U068,22,Male,5.2,9.0,7.0,5.0,1.0,TikTok,10.0 +U069,24,Female,4.8,6.0,7.0,5.0,2.0,TikTok,7.0 +U070,39,Male,3.4,8.0,6.0,7.0,3.0,LinkedIn,10.0 +U071,16,Male,6.9,5.0,8.0,3.0,3.0,YouTube,5.0 +U072,23,Female,7.4,6.0,7.0,2.0,3.0,TikTok,8.0 +U073,39,Male,6.1,5.0,5.0,1.0,5.0,TikTok,10.0 +U074,26,Female,4.8,6.0,6.0,1.0,4.0,LinkedIn,8.0 +U075,32,Female,4.2,8.0,8.0,5.0,2.0,LinkedIn,8.0 +U076,23,Male,4.2,7.0,5.0,0.0,1.0,X (Twitter),10.0 +U077,48,Female,9.7,4.0,8.0,5.0,1.0,LinkedIn,5.0 +U078,20,Female,7.2,6.0,9.0,1.0,2.0,Instagram,6.0 +U079,43,Other,4.0,7.0,7.0,2.0,4.0,YouTube,9.0 +U080,22,Male,5.8,6.0,8.0,3.0,2.0,X (Twitter),7.0 +U081,24,Female,4.8,6.0,7.0,2.0,2.0,Instagram,8.0 +U082,23,Female,6.6,6.0,8.0,5.0,1.0,TikTok,8.0 +U083,27,Male,6.5,7.0,9.0,7.0,2.0,TikTok,8.0 +U084,49,Male,7.5,4.0,7.0,2.0,4.0,X (Twitter),8.0 +U085,48,Other,4.0,7.0,5.0,2.0,5.0,LinkedIn,10.0 +U086,38,Female,5.7,5.0,6.0,3.0,2.0,TikTok,9.0 +U087,39,Male,2.4,7.0,4.0,4.0,2.0,Facebook,10.0 +U088,37,Female,3.2,8.0,7.0,3.0,1.0,X (Twitter),10.0 +U089,42,Male,7.1,3.0,8.0,6.0,5.0,Instagram,6.0 +U090,16,Male,3.8,10.0,6.0,5.0,0.0,X (Twitter),10.0 +U091,29,Male,4.0,6.0,4.0,1.0,4.0,YouTube,9.0 +U092,18,Male,5.5,6.0,6.0,4.0,2.0,LinkedIn,9.0 +U093,16,Male,6.8,7.0,6.0,6.0,2.0,LinkedIn,9.0 +U094,20,Male,3.6,7.0,7.0,2.0,1.0,X (Twitter),8.0 +U095,41,Female,1.0,7.0,5.0,2.0,6.0,TikTok,10.0 +U096,29,Male,3.9,7.0,6.0,2.0,3.0,TikTok,10.0 +U097,42,Male,7.7,4.0,7.0,2.0,3.0,Facebook,8.0 +U098,24,Female,6.1,4.0,9.0,0.0,4.0,Facebook,6.0 +U099,30,Female,7.9,5.0,8.0,0.0,2.0,LinkedIn,8.0 +U100,30,Female,3.5,8.0,4.0,3.0,3.0,Facebook,10.0 +U101,41,Male,2.5,9.0,4.0,6.0,3.0,Instagram,10.0 +U102,28,Male,6.3,5.0,7.0,2.0,2.0,Facebook,7.0 +U103,47,Male,5.9,4.0,7.0,2.0,4.0,Instagram,7.0 +U104,47,Male,6.0,6.0,7.0,6.0,2.0,Facebook,9.0 
+U105,19,Male,7.2,5.0,6.0,1.0,0.0,LinkedIn,9.0 +U106,45,Male,6.3,7.0,8.0,0.0,2.0,LinkedIn,6.0 +U107,38,Male,6.2,7.0,7.0,1.0,3.0,Instagram,9.0 +U108,30,Male,6.8,7.0,7.0,2.0,1.0,X (Twitter),10.0 +U109,44,Male,6.4,6.0,8.0,0.0,1.0,LinkedIn,7.0 +U110,28,Male,7.0,6.0,6.0,3.0,2.0,X (Twitter),8.0 +U111,47,Male,7.3,4.0,6.0,5.0,4.0,TikTok,8.0 +U112,22,Female,8.3,5.0,10.0,0.0,2.0,Instagram,6.0 +U113,37,Female,4.3,6.0,8.0,3.0,5.0,LinkedIn,9.0 +U114,43,Male,6.4,6.0,7.0,4.0,2.0,YouTube,7.0 +U115,17,Female,4.8,7.0,6.0,3.0,3.0,TikTok,10.0 +U116,21,Female,4.0,9.0,5.0,3.0,2.0,X (Twitter),10.0 +U117,43,Male,4.3,7.0,6.0,3.0,4.0,LinkedIn,9.0 +U118,43,Female,6.4,6.0,7.0,1.0,1.0,Instagram,9.0 +U119,35,Female,6.3,6.0,7.0,6.0,4.0,Facebook,9.0 +U120,45,Male,6.2,5.0,8.0,5.0,3.0,YouTube,8.0 +U121,26,Male,5.1,6.0,5.0,5.0,2.0,X (Twitter),9.0 +U122,43,Female,5.5,8.0,6.0,5.0,2.0,TikTok,10.0 +U123,40,Female,7.0,4.0,6.0,0.0,5.0,Instagram,8.0 +U124,48,Male,2.8,7.0,5.0,5.0,3.0,YouTube,10.0 +U125,16,Male,5.5,7.0,7.0,4.0,1.0,Instagram,9.0 +U126,42,Male,5.7,7.0,7.0,0.0,1.0,LinkedIn,9.0 +U127,28,Female,6.9,5.0,8.0,3.0,3.0,Facebook,8.0 +U128,18,Female,3.1,7.0,6.0,4.0,3.0,TikTok,9.0 +U129,21,Male,8.2,5.0,7.0,6.0,4.0,Instagram,7.0 +U130,23,Male,5.9,5.0,5.0,4.0,0.0,X (Twitter),9.0 +U131,42,Female,6.0,5.0,8.0,1.0,1.0,LinkedIn,7.0 +U132,24,Male,4.8,8.0,4.0,6.0,4.0,LinkedIn,10.0 +U133,48,Male,6.5,6.0,8.0,3.0,3.0,TikTok,8.0 +U134,39,Female,5.6,8.0,6.0,2.0,3.0,LinkedIn,9.0 +U135,30,Female,7.7,4.0,10.0,2.0,1.0,X (Twitter),8.0 +U136,47,Female,2.9,8.0,5.0,0.0,1.0,Instagram,10.0 +U137,47,Female,5.8,6.0,7.0,3.0,4.0,TikTok,10.0 +U138,39,Male,6.2,5.0,8.0,4.0,4.0,LinkedIn,7.0 +U139,27,Male,5.2,7.0,5.0,2.0,1.0,LinkedIn,10.0 +U140,17,Female,4.8,8.0,5.0,4.0,1.0,X (Twitter),10.0 +U141,18,Male,4.8,6.0,5.0,1.0,2.0,Facebook,10.0 +U142,32,Male,5.8,6.0,6.0,3.0,1.0,Facebook,10.0 +U143,17,Male,2.2,10.0,5.0,4.0,1.0,LinkedIn,10.0 +U144,17,Male,7.2,7.0,8.0,3.0,1.0,Instagram,8.0 +U145,43,Male,4.6,8.0,7.0,0.0,2.0,TikTok,9.0 
+U146,38,Male,5.7,7.0,5.0,2.0,1.0,LinkedIn,10.0 +U147,47,Male,6.2,5.0,7.0,6.0,1.0,YouTube,9.0 +U148,48,Male,4.6,7.0,5.0,2.0,3.0,TikTok,10.0 +U149,16,Female,5.3,7.0,6.0,3.0,2.0,Facebook,10.0 +U150,34,Female,1.8,9.0,7.0,3.0,3.0,LinkedIn,10.0 +U151,17,Female,6.8,5.0,8.0,6.0,0.0,X (Twitter),6.0 +U152,41,Female,7.4,4.0,9.0,6.0,3.0,YouTube,5.0 +U153,47,Female,2.4,7.0,7.0,5.0,6.0,LinkedIn,9.0 +U154,21,Male,4.8,5.0,7.0,1.0,2.0,Instagram,8.0 +U155,47,Female,3.3,8.0,5.0,4.0,4.0,YouTube,10.0 +U156,19,Female,5.2,7.0,5.0,4.0,4.0,LinkedIn,10.0 +U157,26,Female,6.7,5.0,7.0,2.0,4.0,Instagram,8.0 +U158,32,Male,7.3,5.0,7.0,1.0,3.0,YouTube,8.0 +U159,39,Male,7.5,5.0,6.0,6.0,3.0,Instagram,7.0 +U160,20,Male,7.9,5.0,9.0,2.0,4.0,Instagram,5.0 +U161,49,Male,5.1,5.0,7.0,6.0,3.0,YouTube,8.0 +U162,21,Female,3.4,9.0,7.0,5.0,1.0,YouTube,8.0 +U163,37,Female,5.3,7.0,6.0,4.0,0.0,Instagram,10.0 +U164,26,Male,6.6,6.0,8.0,4.0,0.0,Facebook,8.0 +U165,31,Other,3.3,9.0,5.0,0.0,2.0,YouTube,10.0 +U166,48,Female,8.5,4.0,10.0,3.0,3.0,LinkedIn,8.0 +U167,24,Male,7.1,6.0,8.0,6.0,1.0,TikTok,7.0 +U168,21,Male,5.1,6.0,5.0,4.0,4.0,X (Twitter),10.0 +U169,31,Male,4.2,8.0,6.0,3.0,2.0,Instagram,10.0 +U170,44,Male,4.3,8.0,6.0,6.0,2.0,Instagram,10.0 +U171,18,Male,5.6,5.0,7.0,3.0,1.0,X (Twitter),6.0 +U172,35,Male,5.1,7.0,7.0,2.0,0.0,Facebook,7.0 +U173,34,Male,7.2,6.0,6.0,3.0,3.0,X (Twitter),8.0 +U174,41,Male,4.9,7.0,5.0,5.0,5.0,YouTube,10.0 +U175,18,Female,6.4,6.0,8.0,0.0,2.0,X (Twitter),8.0 +U176,34,Male,5.4,6.0,6.0,4.0,0.0,Instagram,10.0 +U177,35,Female,6.0,7.0,7.0,5.0,4.0,X (Twitter),8.0 +U178,47,Male,5.1,7.0,4.0,2.0,0.0,TikTok,9.0 +U179,22,Other,6.8,5.0,10.0,5.0,1.0,LinkedIn,7.0 +U180,48,Male,6.8,4.0,7.0,4.0,2.0,YouTube,7.0 +U181,33,Male,6.3,6.0,8.0,7.0,3.0,X (Twitter),6.0 +U182,16,Other,6.6,6.0,7.0,1.0,1.0,Instagram,8.0 +U183,26,Female,3.8,10.0,6.0,5.0,2.0,LinkedIn,10.0 +U184,43,Female,6.1,7.0,9.0,1.0,4.0,YouTube,7.0 +U185,40,Male,7.7,3.0,9.0,0.0,4.0,Instagram,5.0 +U186,38,Male,6.4,6.0,7.0,0.0,4.0,Facebook,8.0 
+U187,46,Female,3.6,9.0,5.0,4.0,5.0,TikTok,10.0 +U188,45,Female,5.4,7.0,8.0,3.0,4.0,Facebook,7.0 +U189,22,Female,6.2,6.0,9.0,2.0,2.0,TikTok,6.0 +U190,31,Female,4.1,8.0,6.0,0.0,4.0,X (Twitter),10.0 +U191,41,Male,3.1,7.0,7.0,2.0,3.0,YouTube,9.0 +U192,17,Female,4.9,7.0,6.0,3.0,3.0,X (Twitter),10.0 +U193,16,Male,4.2,7.0,7.0,5.0,3.0,TikTok,8.0 +U194,27,Male,5.5,6.0,8.0,3.0,1.0,TikTok,7.0 +U195,20,Male,2.6,9.0,4.0,4.0,1.0,TikTok,10.0 +U196,47,Female,6.5,5.0,7.0,2.0,2.0,LinkedIn,8.0 +U197,24,Male,4.4,8.0,7.0,6.0,3.0,Instagram,10.0 +U198,34,Male,7.3,5.0,7.0,1.0,1.0,YouTube,6.0 +U199,31,Female,5.0,7.0,5.0,2.0,3.0,X (Twitter),10.0 +U200,18,Male,5.5,5.0,6.0,2.0,6.0,X (Twitter),9.0 +U201,35,Female,5.2,7.0,6.0,4.0,0.0,Facebook,10.0 +U202,39,Male,4.3,9.0,7.0,1.0,3.0,TikTok,9.0 +U203,48,Female,10.0,3.0,10.0,3.0,2.0,TikTok,4.0 +U204,39,Female,4.6,6.0,7.0,3.0,3.0,TikTok,6.0 +U205,26,Male,7.3,5.0,8.0,2.0,2.0,X (Twitter),7.0 +U206,23,Male,3.5,9.0,5.0,4.0,2.0,Facebook,10.0 +U207,35,Male,6.2,4.0,8.0,1.0,3.0,Instagram,7.0 +U208,40,Male,6.4,5.0,8.0,1.0,2.0,LinkedIn,7.0 +U209,40,Other,5.5,6.0,7.0,3.0,0.0,YouTube,9.0 +U210,44,Male,4.5,6.0,6.0,0.0,2.0,Instagram,9.0 +U211,33,Female,5.0,6.0,6.0,0.0,0.0,X (Twitter),8.0 +U212,33,Male,4.8,5.0,6.0,4.0,3.0,LinkedIn,9.0 +U213,17,Female,9.8,4.0,9.0,5.0,0.0,LinkedIn,6.0 +U214,31,Female,5.5,7.0,7.0,2.0,4.0,LinkedIn,9.0 +U215,48,Female,6.4,4.0,6.0,2.0,3.0,TikTok,7.0 +U216,19,Female,8.7,6.0,7.0,5.0,3.0,YouTube,10.0 +U217,48,Male,3.9,9.0,5.0,1.0,0.0,X (Twitter),10.0 +U218,29,Male,5.6,6.0,6.0,0.0,2.0,X (Twitter),8.0 +U219,36,Female,5.0,8.0,6.0,4.0,4.0,LinkedIn,10.0 +U220,35,Female,5.3,6.0,6.0,3.0,2.0,X (Twitter),9.0 +U221,23,Other,5.2,7.0,6.0,5.0,1.0,YouTube,9.0 +U222,22,Male,4.2,6.0,6.0,0.0,2.0,YouTube,8.0 +U223,18,Female,5.9,8.0,6.0,0.0,2.0,Facebook,8.0 +U224,32,Female,3.6,7.0,5.0,3.0,1.0,Facebook,10.0 +U225,48,Male,7.3,5.0,9.0,4.0,4.0,Instagram,5.0 +U226,27,Male,6.6,5.0,7.0,5.0,2.0,YouTube,7.0 +U227,37,Female,4.6,9.0,6.0,3.0,2.0,YouTube,8.0 
+U228,37,Female,4.7,9.0,6.0,3.0,1.0,YouTube,10.0 +U229,45,Female,6.0,8.0,7.0,2.0,3.0,Facebook,9.0 +U230,23,Male,4.1,8.0,5.0,6.0,3.0,LinkedIn,10.0 +U231,42,Female,8.8,4.0,8.0,1.0,1.0,LinkedIn,6.0 +U232,42,Male,6.1,7.0,6.0,4.0,0.0,X (Twitter),8.0 +U233,49,Male,6.2,6.0,8.0,1.0,3.0,LinkedIn,8.0 +U234,36,Male,6.7,7.0,7.0,4.0,2.0,TikTok,8.0 +U235,45,Female,4.5,8.0,6.0,1.0,3.0,Facebook,10.0 +U236,48,Female,7.5,4.0,9.0,4.0,5.0,YouTube,5.0 +U237,43,Female,5.7,7.0,7.0,3.0,2.0,LinkedIn,9.0 +U238,48,Male,7.2,6.0,8.0,6.0,2.0,Facebook,9.0 +U239,20,Female,5.1,4.0,5.0,2.0,1.0,LinkedIn,9.0 +U240,34,Male,7.9,6.0,8.0,4.0,2.0,Instagram,7.0 +U241,19,Female,6.2,5.0,7.0,1.0,3.0,YouTube,8.0 +U242,32,Male,9.4,3.0,9.0,2.0,3.0,Instagram,5.0 +U243,43,Male,4.8,6.0,5.0,3.0,5.0,YouTube,10.0 +U244,45,Male,5.9,6.0,8.0,0.0,3.0,Facebook,7.0 +U245,44,Male,3.6,8.0,5.0,0.0,2.0,X (Twitter),10.0 +U246,21,Female,4.5,6.0,6.0,4.0,1.0,LinkedIn,9.0 +U247,39,Female,6.6,6.0,7.0,3.0,3.0,LinkedIn,7.0 +U248,44,Male,7.5,6.0,9.0,2.0,1.0,TikTok,6.0 +U249,46,Female,10.8,5.0,10.0,2.0,3.0,Instagram,4.0 +U250,48,Male,5.3,5.0,5.0,2.0,3.0,LinkedIn,8.0 +U251,36,Female,3.8,7.0,5.0,3.0,2.0,Facebook,9.0 +U252,47,Female,6.9,6.0,8.0,4.0,2.0,X (Twitter),8.0 +U253,38,Female,3.5,9.0,4.0,0.0,0.0,Facebook,10.0 +U254,48,Male,6.5,5.0,8.0,6.0,4.0,Instagram,8.0 +U255,18,Female,4.6,8.0,7.0,6.0,3.0,TikTok,9.0 +U256,33,Female,5.9,4.0,7.0,0.0,3.0,LinkedIn,5.0 +U257,40,Male,7.5,4.0,8.0,5.0,0.0,Facebook,7.0 +U258,46,Male,5.3,7.0,8.0,5.0,5.0,YouTube,9.0 +U259,18,Female,7.2,4.0,7.0,3.0,2.0,X (Twitter),6.0 +U260,39,Female,7.1,4.0,8.0,0.0,2.0,Facebook,7.0 +U261,47,Female,2.1,8.0,4.0,5.0,1.0,X (Twitter),10.0 +U262,37,Female,3.9,8.0,6.0,3.0,0.0,LinkedIn,10.0 +U263,38,Female,2.7,6.0,3.0,4.0,4.0,YouTube,10.0 +U264,17,Male,9.1,4.0,9.0,7.0,4.0,TikTok,4.0 +U265,42,Male,5.3,7.0,7.0,4.0,2.0,TikTok,8.0 +U266,17,Male,8.7,4.0,8.0,1.0,1.0,YouTube,7.0 +U267,41,Female,2.6,8.0,4.0,2.0,2.0,X (Twitter),10.0 +U268,32,Male,4.5,7.0,6.0,1.0,2.0,LinkedIn,8.0 
+U269,48,Female,6.9,5.0,9.0,4.0,1.0,Facebook,6.0 +U270,24,Female,5.8,8.0,6.0,1.0,2.0,YouTube,10.0 +U271,44,Female,6.5,5.0,6.0,2.0,3.0,Instagram,8.0 +U272,41,Male,3.2,8.0,6.0,0.0,4.0,X (Twitter),10.0 +U273,40,Female,8.0,5.0,8.0,3.0,3.0,TikTok,6.0 +U274,39,Male,6.9,5.0,7.0,3.0,0.0,YouTube,9.0 +U275,28,Female,6.2,3.0,8.0,0.0,3.0,TikTok,5.0 +U276,22,Male,4.3,7.0,5.0,4.0,2.0,X (Twitter),10.0 +U277,35,Male,6.1,7.0,8.0,3.0,5.0,TikTok,9.0 +U278,16,Female,5.0,7.0,6.0,1.0,5.0,YouTube,10.0 +U279,23,Male,2.5,9.0,5.0,7.0,5.0,X (Twitter),10.0 +U280,31,Male,5.4,5.0,8.0,5.0,2.0,Instagram,7.0 +U281,29,Female,5.2,7.0,6.0,1.0,5.0,YouTube,10.0 +U282,27,Female,7.4,4.0,9.0,5.0,3.0,Facebook,5.0 +U283,38,Female,1.5,9.0,4.0,5.0,5.0,LinkedIn,10.0 +U284,30,Female,5.9,6.0,9.0,3.0,2.0,YouTube,7.0 +U285,43,Female,3.4,7.0,7.0,4.0,2.0,X (Twitter),10.0 +U286,49,Male,6.1,5.0,6.0,4.0,3.0,LinkedIn,9.0 +U287,17,Male,6.2,6.0,8.0,0.0,3.0,YouTube,7.0 +U288,47,Male,5.3,6.0,6.0,3.0,4.0,Instagram,9.0 +U289,38,Female,7.0,4.0,8.0,3.0,2.0,Instagram,6.0 +U290,37,Female,3.3,7.0,5.0,6.0,4.0,Facebook,9.0 +U291,40,Male,7.5,5.0,7.0,4.0,4.0,TikTok,9.0 +U292,37,Male,4.2,9.0,7.0,2.0,2.0,LinkedIn,8.0 +U293,37,Other,2.7,8.0,5.0,3.0,3.0,X (Twitter),10.0 +U294,21,Male,5.0,7.0,6.0,5.0,2.0,X (Twitter),10.0 +U295,30,Female,1.7,9.0,4.0,3.0,6.0,TikTok,10.0 +U296,48,Female,6.0,5.0,7.0,2.0,0.0,Instagram,7.0 +U297,23,Female,5.0,5.0,7.0,2.0,4.0,Instagram,8.0 +U298,20,Male,7.7,4.0,8.0,5.0,0.0,YouTube,5.0 +U299,19,Female,6.7,6.0,5.0,4.0,2.0,Facebook,9.0 +U300,21,Female,3.0,8.0,5.0,3.0,2.0,LinkedIn,10.0 +U301,47,Female,4.0,8.0,4.0,3.0,5.0,Facebook,10.0 +U302,45,Female,6.4,6.0,7.0,2.0,1.0,Facebook,7.0 +U303,31,Female,7.0,5.0,6.0,3.0,4.0,YouTube,8.0 +U304,28,Female,7.7,4.0,7.0,2.0,1.0,Facebook,7.0 +U305,45,Male,7.7,6.0,9.0,2.0,5.0,TikTok,6.0 +U306,34,Female,5.3,6.0,6.0,6.0,2.0,YouTube,10.0 +U307,32,Female,6.2,5.0,7.0,1.0,3.0,TikTok,7.0 +U308,34,Male,7.8,5.0,9.0,2.0,2.0,YouTube,6.0 +U309,43,Female,5.0,5.0,6.0,0.0,4.0,Facebook,8.0 
+U310,41,Female,6.2,8.0,9.0,0.0,4.0,TikTok,7.0 +U311,41,Female,4.9,7.0,6.0,3.0,3.0,TikTok,9.0 +U312,38,Male,4.5,7.0,6.0,2.0,1.0,TikTok,9.0 +U313,24,Male,5.6,5.0,8.0,7.0,1.0,YouTube,7.0 +U314,27,Other,4.3,7.0,7.0,2.0,1.0,X (Twitter),8.0 +U315,16,Female,5.3,6.0,6.0,6.0,0.0,Facebook,9.0 +U316,16,Male,3.6,7.0,5.0,5.0,4.0,TikTok,10.0 +U317,49,Female,5.2,7.0,8.0,3.0,2.0,LinkedIn,8.0 +U318,47,Female,6.3,5.0,8.0,7.0,2.0,X (Twitter),8.0 +U319,40,Female,4.4,8.0,7.0,4.0,5.0,LinkedIn,8.0 +U320,16,Female,5.5,9.0,5.0,6.0,2.0,Facebook,10.0 +U321,31,Male,5.8,6.0,8.0,6.0,1.0,Facebook,7.0 +U322,20,Male,6.3,6.0,6.0,4.0,2.0,TikTok,8.0 +U323,37,Male,5.3,7.0,7.0,5.0,1.0,TikTok,8.0 +U324,44,Female,7.5,7.0,8.0,4.0,3.0,YouTube,9.0 +U325,18,Male,1.7,9.0,3.0,1.0,2.0,X (Twitter),10.0 +U326,27,Male,10.8,2.0,9.0,3.0,2.0,X (Twitter),5.0 +U327,41,Male,3.9,6.0,4.0,5.0,2.0,TikTok,9.0 +U328,31,Female,6.2,6.0,7.0,6.0,2.0,Facebook,8.0 +U329,37,Female,3.8,7.0,4.0,2.0,3.0,TikTok,10.0 +U330,44,Male,6.7,8.0,6.0,4.0,2.0,Facebook,10.0 +U331,29,Male,6.0,8.0,7.0,3.0,2.0,Facebook,7.0 +U332,43,Female,4.3,6.0,3.0,3.0,2.0,TikTok,10.0 +U333,20,Male,7.4,4.0,9.0,4.0,2.0,Instagram,6.0 +U334,45,Male,5.2,7.0,4.0,3.0,3.0,X (Twitter),10.0 +U335,20,Male,6.5,7.0,7.0,4.0,1.0,LinkedIn,7.0 +U336,27,Male,3.8,8.0,6.0,6.0,4.0,Facebook,10.0 +U337,31,Male,5.4,6.0,7.0,5.0,3.0,Facebook,8.0 +U338,41,Female,9.1,4.0,9.0,4.0,2.0,Instagram,7.0 +U339,41,Male,7.2,5.0,10.0,7.0,3.0,LinkedIn,7.0 +U340,36,Female,6.1,7.0,7.0,5.0,2.0,YouTube,9.0 +U341,48,Male,4.7,6.0,7.0,3.0,0.0,Instagram,8.0 +U342,45,Female,5.6,6.0,6.0,1.0,3.0,LinkedIn,7.0 +U343,38,Female,5.9,7.0,7.0,1.0,1.0,Instagram,8.0 +U344,25,Male,6.3,5.0,9.0,5.0,4.0,Facebook,8.0 +U345,20,Female,3.6,6.0,5.0,2.0,2.0,X (Twitter),10.0 +U346,49,Male,6.7,6.0,8.0,1.0,4.0,Instagram,8.0 +U347,46,Other,3.5,7.0,6.0,1.0,2.0,YouTube,10.0 +U348,25,Male,8.4,5.0,8.0,2.0,3.0,TikTok,8.0 +U349,34,Male,4.1,8.0,4.0,5.0,4.0,LinkedIn,10.0 +U350,47,Male,6.0,6.0,7.0,3.0,2.0,Instagram,8.0 
+U351,16,Other,7.3,5.0,9.0,0.0,2.0,Instagram,8.0 +U352,20,Female,5.0,6.0,5.0,3.0,3.0,YouTube,9.0 +U353,19,Female,6.1,6.0,8.0,3.0,3.0,X (Twitter),9.0 +U354,31,Male,8.1,5.0,9.0,2.0,2.0,Instagram,6.0 +U355,39,Female,6.5,5.0,5.0,5.0,0.0,Instagram,10.0 +U356,31,Male,6.1,6.0,6.0,2.0,2.0,X (Twitter),8.0 +U357,17,Female,2.5,9.0,4.0,5.0,4.0,YouTube,10.0 +U358,43,Male,3.8,8.0,5.0,4.0,1.0,LinkedIn,10.0 +U359,47,Male,7.4,6.0,7.0,5.0,2.0,LinkedIn,7.0 +U360,42,Male,6.5,5.0,7.0,1.0,2.0,Instagram,8.0 +U361,35,Female,6.8,4.0,9.0,3.0,0.0,X (Twitter),7.0 +U362,39,Female,6.6,6.0,7.0,3.0,1.0,Instagram,7.0 +U363,27,Male,7.8,5.0,8.0,0.0,4.0,TikTok,6.0 +U364,48,Female,5.0,8.0,6.0,5.0,0.0,LinkedIn,10.0 +U365,48,Male,5.7,7.0,7.0,1.0,3.0,Facebook,9.0 +U366,27,Female,3.6,8.0,5.0,2.0,1.0,TikTok,10.0 +U367,18,Male,3.2,9.0,5.0,1.0,3.0,Instagram,10.0 +U368,16,Male,2.5,8.0,4.0,4.0,4.0,YouTube,10.0 +U369,48,Female,3.9,6.0,5.0,0.0,2.0,LinkedIn,10.0 +U370,25,Male,8.3,5.0,8.0,4.0,2.0,Facebook,7.0 +U371,44,Male,2.7,9.0,5.0,5.0,1.0,TikTok,10.0 +U372,28,Female,5.0,7.0,5.0,4.0,0.0,LinkedIn,9.0 +U373,27,Other,3.2,9.0,6.0,4.0,5.0,X (Twitter),10.0 +U374,46,Male,4.4,7.0,7.0,5.0,1.0,YouTube,8.0 +U375,17,Male,5.8,6.0,7.0,1.0,5.0,TikTok,9.0 +U376,38,Female,4.2,8.0,5.0,6.0,1.0,X (Twitter),10.0 +U377,32,Male,7.5,4.0,7.0,2.0,3.0,Instagram,6.0 +U378,41,Male,8.3,4.0,10.0,3.0,1.0,X (Twitter),6.0 +U379,23,Male,1.5,9.0,3.0,2.0,3.0,TikTok,10.0 +U380,44,Female,4.3,8.0,7.0,4.0,2.0,TikTok,10.0 +U381,41,Female,5.6,6.0,7.0,2.0,2.0,Instagram,9.0 +U382,25,Male,7.5,4.0,8.0,3.0,3.0,X (Twitter),7.0 +U383,41,Male,4.7,6.0,6.0,1.0,2.0,X (Twitter),9.0 +U384,49,Male,5.5,7.0,5.0,5.0,0.0,Instagram,10.0 +U385,22,Female,3.4,8.0,6.0,4.0,4.0,X (Twitter),10.0 +U386,19,Male,2.7,8.0,5.0,2.0,1.0,LinkedIn,9.0 +U387,26,Male,8.1,5.0,7.0,2.0,3.0,Instagram,7.0 +U388,44,Female,2.2,9.0,4.0,4.0,4.0,YouTube,10.0 +U389,40,Male,9.7,4.0,10.0,2.0,2.0,TikTok,6.0 +U390,36,Female,6.8,5.0,8.0,3.0,4.0,YouTube,8.0 +U391,25,Male,6.7,7.0,7.0,5.0,3.0,LinkedIn,8.0 
+U392,24,Male,6.5,7.0,6.0,1.0,1.0,LinkedIn,7.0 +U393,39,Male,5.0,5.0,6.0,4.0,5.0,Facebook,8.0 +U394,33,Female,6.2,6.0,7.0,4.0,1.0,Facebook,9.0 +U395,47,Female,5.2,7.0,7.0,3.0,2.0,Facebook,9.0 +U396,39,Female,6.5,5.0,7.0,2.0,2.0,Instagram,8.0 +U397,38,Male,5.4,8.0,9.0,5.0,3.0,X (Twitter),8.0 +U398,47,Female,3.8,8.0,4.0,2.0,4.0,LinkedIn,10.0 +U399,27,Male,3.9,7.0,5.0,0.0,4.0,YouTube,10.0 +U400,28,Male,3.7,8.0,5.0,4.0,3.0,YouTube,10.0 +U401,38,Female,7.2,5.0,8.0,1.0,2.0,YouTube,9.0 +U402,40,Male,4.4,7.0,5.0,4.0,0.0,YouTube,10.0 +U403,45,Male,2.1,10.0,5.0,6.0,1.0,Facebook,10.0 +U404,32,Male,6.2,6.0,8.0,4.0,5.0,YouTube,7.0 +U405,35,Male,8.2,4.0,9.0,5.0,0.0,X (Twitter),6.0 +U406,40,Male,6.9,3.0,8.0,7.0,1.0,Facebook,6.0 +U407,37,Female,4.4,7.0,6.0,5.0,4.0,YouTube,10.0 +U408,28,Male,2.6,10.0,3.0,4.0,3.0,Instagram,10.0 +U409,34,Female,5.3,6.0,6.0,6.0,4.0,TikTok,8.0 +U410,27,Female,5.2,4.0,7.0,7.0,3.0,TikTok,9.0 +U411,34,Male,3.8,7.0,6.0,7.0,5.0,TikTok,10.0 +U412,27,Male,6.5,7.0,7.0,2.0,3.0,Facebook,10.0 +U413,24,Male,8.8,5.0,9.0,0.0,3.0,YouTube,6.0 +U414,22,Female,5.9,5.0,6.0,2.0,1.0,LinkedIn,9.0 +U415,43,Male,5.8,6.0,7.0,4.0,3.0,LinkedIn,9.0 +U416,29,Female,3.8,7.0,6.0,3.0,2.0,LinkedIn,8.0 +U417,46,Male,8.0,4.0,8.0,3.0,2.0,Facebook,9.0 +U418,34,Female,5.4,4.0,5.0,4.0,4.0,X (Twitter),10.0 +U419,31,Male,4.5,8.0,5.0,0.0,4.0,Instagram,10.0 +U420,20,Male,5.6,8.0,8.0,3.0,5.0,Instagram,8.0 +U421,27,Other,6.3,4.0,9.0,5.0,5.0,TikTok,6.0 +U422,40,Female,5.8,7.0,8.0,4.0,4.0,TikTok,8.0 +U423,36,Male,7.7,5.0,8.0,4.0,0.0,LinkedIn,8.0 +U424,38,Female,6.2,7.0,6.0,6.0,5.0,LinkedIn,10.0 +U425,31,Female,5.0,7.0,6.0,3.0,4.0,TikTok,10.0 +U426,29,Female,6.2,6.0,9.0,2.0,1.0,YouTube,8.0 +U427,46,Female,6.4,6.0,8.0,5.0,4.0,X (Twitter),8.0 +U428,20,Male,3.2,7.0,5.0,5.0,1.0,X (Twitter),9.0 +U429,38,Male,7.6,6.0,9.0,4.0,3.0,X (Twitter),8.0 +U430,44,Male,5.0,6.0,5.0,3.0,1.0,X (Twitter),10.0 +U431,26,Female,5.0,6.0,6.0,3.0,1.0,TikTok,10.0 +U432,33,Female,6.9,6.0,7.0,0.0,4.0,Instagram,7.0 
+U433,27,Male,6.8,5.0,8.0,5.0,4.0,TikTok,7.0 +U434,24,Male,4.2,8.0,7.0,5.0,3.0,X (Twitter),7.0 +U435,25,Female,6.9,7.0,7.0,0.0,2.0,TikTok,9.0 +U436,32,Other,4.5,8.0,5.0,5.0,0.0,X (Twitter),10.0 +U437,22,Female,8.8,2.0,9.0,4.0,2.0,Facebook,7.0 +U438,28,Male,4.0,7.0,5.0,1.0,3.0,LinkedIn,9.0 +U439,24,Male,5.9,7.0,6.0,1.0,2.0,Instagram,8.0 +U440,42,Male,4.1,6.0,6.0,4.0,2.0,Facebook,9.0 +U441,17,Female,9.5,4.0,9.0,2.0,2.0,YouTube,4.0 +U442,20,Male,3.3,8.0,4.0,7.0,2.0,TikTok,10.0 +U443,44,Female,6.4,5.0,6.0,5.0,3.0,YouTube,6.0 +U444,34,Male,4.2,8.0,4.0,4.0,2.0,TikTok,10.0 +U445,23,Female,4.9,7.0,7.0,4.0,5.0,LinkedIn,7.0 +U446,16,Female,6.3,6.0,7.0,6.0,0.0,Facebook,8.0 +U447,37,Female,7.7,4.0,8.0,5.0,2.0,LinkedIn,8.0 +U448,32,Male,8.4,5.0,9.0,4.0,1.0,X (Twitter),7.0 +U449,22,Male,2.7,6.0,6.0,1.0,3.0,LinkedIn,8.0 +U450,40,Male,6.0,4.0,6.0,9.0,3.0,LinkedIn,8.0 +U451,19,Female,8.1,5.0,9.0,1.0,5.0,X (Twitter),7.0 +U452,21,Female,2.5,9.0,3.0,2.0,5.0,TikTok,10.0 +U453,46,Female,6.1,6.0,8.0,0.0,1.0,X (Twitter),6.0 +U454,34,Female,5.5,6.0,5.0,4.0,3.0,Facebook,10.0 +U455,42,Female,6.2,6.0,6.0,3.0,3.0,LinkedIn,6.0 +U456,25,Male,4.1,6.0,5.0,5.0,1.0,X (Twitter),9.0 +U457,41,Female,2.9,8.0,3.0,0.0,3.0,LinkedIn,10.0 +U458,34,Male,2.1,9.0,4.0,5.0,2.0,Facebook,10.0 +U459,18,Male,7.3,5.0,8.0,1.0,1.0,YouTube,7.0 +U460,28,Female,7.5,6.0,8.0,3.0,2.0,Facebook,6.0 +U461,43,Male,6.0,6.0,5.0,5.0,1.0,Instagram,9.0 +U462,35,Male,4.5,7.0,8.0,4.0,4.0,Instagram,8.0 +U463,43,Female,2.6,9.0,5.0,4.0,5.0,Instagram,10.0 +U464,23,Male,6.5,5.0,10.0,5.0,3.0,LinkedIn,5.0 +U465,16,Female,3.5,8.0,5.0,4.0,3.0,TikTok,10.0 +U466,18,Male,4.2,7.0,7.0,2.0,3.0,TikTok,10.0 +U467,28,Male,4.4,7.0,7.0,3.0,4.0,TikTok,8.0 +U468,43,Male,5.1,6.0,6.0,3.0,4.0,LinkedIn,9.0 +U469,40,Male,4.1,6.0,7.0,1.0,2.0,Facebook,10.0 +U470,48,Female,5.6,6.0,6.0,5.0,3.0,TikTok,8.0 +U471,21,Male,4.4,5.0,6.0,4.0,3.0,YouTube,7.0 +U472,47,Female,6.8,5.0,4.0,3.0,1.0,Instagram,10.0 +U473,36,Female,7.4,3.0,8.0,5.0,4.0,Facebook,5.0 
+U474,31,Female,5.7,7.0,5.0,3.0,5.0,Instagram,10.0 +U475,36,Female,6.3,7.0,7.0,3.0,4.0,TikTok,8.0 +U476,26,Female,9.3,3.0,8.0,2.0,1.0,Facebook,6.0 +U477,34,Male,3.4,8.0,5.0,4.0,0.0,X (Twitter),10.0 +U478,35,Male,3.3,9.0,6.0,4.0,4.0,YouTube,10.0 +U479,33,Male,6.8,6.0,8.0,2.0,1.0,TikTok,9.0 +U480,29,Female,9.0,4.0,10.0,3.0,3.0,TikTok,4.0 +U481,30,Male,5.0,7.0,8.0,2.0,5.0,YouTube,7.0 +U482,46,Female,8.6,4.0,8.0,2.0,2.0,YouTube,4.0 +U483,16,Female,5.7,6.0,6.0,0.0,2.0,X (Twitter),8.0 +U484,18,Male,4.3,7.0,7.0,3.0,3.0,TikTok,10.0 +U485,31,Female,3.7,7.0,7.0,0.0,1.0,LinkedIn,9.0 +U486,38,Male,6.6,6.0,8.0,2.0,1.0,TikTok,7.0 +U487,26,Male,3.9,8.0,5.0,2.0,3.0,Instagram,10.0 +U488,27,Female,5.6,7.0,7.0,5.0,2.0,Facebook,8.0 +U489,25,Female,8.4,5.0,9.0,1.0,3.0,Facebook,7.0 +U490,47,Female,4.9,7.0,7.0,1.0,7.0,Instagram,10.0 +U491,31,Male,7.9,5.0,8.0,2.0,3.0,Facebook,6.0 +U492,23,Male,3.3,10.0,4.0,2.0,1.0,YouTube,10.0 +U493,27,Female,4.5,5.0,7.0,6.0,5.0,Facebook,9.0 +U494,39,Female,3.0,7.0,2.0,1.0,0.0,Facebook,10.0 +U495,43,Female,5.6,8.0,6.0,2.0,0.0,Instagram,10.0 +U496,23,Male,6.9,5.0,7.0,4.0,2.0,X (Twitter),10.0 +U497,43,Female,5.6,7.0,6.0,5.0,2.0,Facebook,9.0 +U498,41,Male,7.7,5.0,7.0,2.0,2.0,LinkedIn,8.0 +U499,23,Male,4.2,9.0,7.0,0.0,2.0,Facebook,9.0 +U500,43,Female,5.9,5.0,8.0,3.0,3.0,X (Twitter),7.0 diff --git a/src/Backend/test_data/parquet/capitals_clean.parquet b/src/Backend/test_data/parquet/capitals_clean.parquet new file mode 100644 index 0000000..5593be6 Binary files /dev/null and b/src/Backend/test_data/parquet/capitals_clean.parquet differ