Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions SqlPipeline/SqlPipeline/Private/duckdb/Get-DuckDBBestType.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
function Get-DuckDBBestType {
<#
.SYNOPSIS
Infers the widest DuckDB SQL type able to represent every non-null entry
in $Values. Used for multi-row type inference.
.DESCRIPTION
Walks the sample values and resolves type conflicts by widening:
    BOOLEAN + BIGINT -> BIGINT
    BOOLEAN + DOUBLE -> DOUBLE
    BIGINT  + DOUBLE -> DOUBLE
Any other combination degrades to VARCHAR, which is also the fallback
when every value is null.
.PARAMETER Values
Collection of sample values for a single column; null entries are skipped.
.OUTPUTS
System.String - a DuckDB type name (e.g. BOOLEAN, BIGINT, DOUBLE, VARCHAR).
#>
    [CmdletBinding()]
    [OutputType([string])]
    param(
        [Parameter(Mandatory)] $Values
    )

    # Widening table keyed by the alphabetically sorted type pair, so the
    # lookup is independent of the order in which the two types were seen.
    $widen = @{
        'BIGINT+BOOLEAN' = 'BIGINT'
        'BIGINT+DOUBLE'  = 'DOUBLE'
        'BOOLEAN+DOUBLE' = 'DOUBLE'
    }

    $inferred = $null

    foreach ($value in $Values) {
        # NOTE(review): only $null is skipped here; [System.DBNull]::Value
        # (as produced by DataTable rows) falls through to
        # ConvertTo-DuckDBType — confirm that is intended.
        if ($null -eq $value) { continue }

        $candidate = ConvertTo-DuckDBType -Value $value

        if ($null -eq $inferred) {
            $inferred = $candidate
        } elseif ($inferred -ne $candidate) {
            # Conflict: widen via the lookup table, else fall back to VARCHAR.
            $key = (($inferred, $candidate) | Sort-Object) -join '+'
            $inferred = if ($widen.ContainsKey($key)) { $widen[$key] } else { 'VARCHAR' }
        }

        # VARCHAR is terminal - nothing can widen further, so stop scanning.
        if ($inferred -eq 'VARCHAR') { break }
    }

    if ($null -eq $inferred) { return 'VARCHAR' }
    return $inferred
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ function Initialize-DuckDBTable {
param(
[Parameter(Mandatory)] [DuckDB.NET.Data.DuckDBConnection]$Connection,
[Parameter(Mandatory)] [string]$TableName,
[Parameter(Mandatory)] $SampleRow,
[Parameter(Mandatory)] $SampleRows,
[string[]]$PKColumns = @()
)

Expand All @@ -26,9 +26,13 @@ function Initialize-DuckDBTable {

Write-Verbose "[$TableName] Creating new table..."

$colDefs = $SampleRow.PSObject.Properties | ForEach-Object {
$sqlType = ConvertTo-DuckDBType -Value $_.Value
" ""$($_.Name)"" $sqlType"
# Use the first row for column names, but derive the type from all sample rows
# so that mixed-type columns (e.g. int and double) get the correct wider type.
$colDefs = $SampleRows[0].PSObject.Properties.Name | ForEach-Object {
$col = $_
$values = $SampleRows | ForEach-Object { $_.$col }
$sqlType = Get-DuckDBBestType -Values $values
" ""$col"" $sqlType"
}

$pkDef = if ($PKColumns.Count -gt 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,16 @@ function Invoke-BufferedWrite {

if ($Data.Count -eq 0) { return }

# Use the first 100 rows for type inference so mixed-type columns
# (e.g. integer in one row, double in another) get the correct wider type.
$sampleRows = $Data.GetRange(0, [Math]::Min(100, $Data.Count))

# 1. Create table if it does not exist
Initialize-DuckDBTable -Connection $Connection -TableName $TableName `
-SampleRow $Data[0] -PKColumns $PKColumns
-SampleRows $sampleRows -PKColumns $PKColumns

# 2. Extend schema with new columns
Sync-DuckDBSchema -Connection $Connection -TableName $TableName -SampleRow $Data[0]
Sync-DuckDBSchema -Connection $Connection -TableName $TableName -SampleRows $sampleRows

# 3. Normalize missing columns
$expectedCols = Get-DuckDBColumns -Connection $Connection -TableName $TableName
Expand Down
7 changes: 4 additions & 3 deletions SqlPipeline/SqlPipeline/Private/duckdb/Sync-DuckDBSchema.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ function Sync-DuckDBSchema {
param(
[Parameter(Mandatory)] [DuckDB.NET.Data.DuckDBConnection]$Connection,
[Parameter(Mandatory)] [string]$TableName,
[Parameter(Mandatory)] $SampleRow
[Parameter(Mandatory)] $SampleRows
)

$existingCols = Get-DuckDBColumns -Connection $Connection -TableName $TableName
$incomingCols = $SampleRow.PSObject.Properties.Name
$incomingCols = $SampleRows[0].PSObject.Properties.Name

$newCols = $incomingCols | Where-Object { $_ -notin $existingCols }

Expand All @@ -26,7 +26,8 @@ function Sync-DuckDBSchema {
}

foreach ($col in $newCols) {
$sqlType = ConvertTo-DuckDBType -Value $SampleRow.$col
$values = $SampleRows | ForEach-Object { $_.$col }
$sqlType = Get-DuckDBBestType -Values $values
Write-Verbose "[$TableName] New column: $col ($sqlType)"
Invoke-DuckDBQuery -Connection $Connection -Query `
"ALTER TABLE $TableName ADD COLUMN IF NOT EXISTS $col $sqlType"
Expand Down
45 changes: 44 additions & 1 deletion SqlPipeline/SqlPipeline/Private/duckdb/Write-DuckDBAppender.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,20 @@ function Write-DuckDBAppender {
[switch]$SimpleTypesOnly = $false
)

# Read column types from schema so we can cast numeric values correctly.
# DuckDB.NET's AppendValue reinterprets bytes rather than converting when
# the .NET type does not match the column type (e.g. Int64 into a DOUBLE
# column yields 7.4e-323 instead of 15).
$columnTypes = @{}
$schemaCmd = $Connection.CreateCommand()
$schemaCmd.CommandText = "DESCRIBE ""$TableName"""
$schemaReader = $schemaCmd.ExecuteReader()
while ($schemaReader.Read()) {
$columnTypes[$schemaReader.GetString(0)] = $schemaReader.GetString(1)
}
$schemaReader.Close()
$schemaCmd.Dispose()

$appender = $Connection.CreateAppender($TableName)
$propNames = $null # cached once from first row

Expand All @@ -31,9 +45,38 @@ function Write-DuckDBAppender {
} elseif ($val -is [float]) {
$val = [double]$val
}

# Cast values to the declared column type so DuckDB.NET picks the
# correct AppendValue overload. Without this:
# [long] → DOUBLE reinterprets raw bytes (15 becomes 7.4e-323)
# [bool] → BIGINT throws "Cannot write Boolean to BigInt column"
# [long] → VARCHAR throws "Cannot write Int64 to Varchar column"
if ($null -ne $val -and $columnTypes.ContainsKey($name)) {
$colType = $columnTypes[$name]
$isFloat = $colType -eq 'DOUBLE' -or $colType -eq 'FLOAT' -or
$colType -eq 'REAL' -or $colType -eq 'FLOAT4' -or $colType -eq 'FLOAT8'
$isInt = $colType -eq 'BIGINT' -or $colType -eq 'INTEGER' -or
$colType -eq 'HUGEINT' -or $colType -eq 'INT8' -or $colType -eq 'INT4'

if ($val -is [bool] -and $colType -ne 'BOOLEAN') {
# bool cannot be appended to non-BOOLEAN columns
if ($isFloat) { $val = [double][int]$val }
elseif ($isInt) { $val = [long][int]$val }
else { $val = [string]$val }
} elseif ($val -is [long] -and $isFloat) {
$val = [double]$val
} elseif ($val -is [double] -and $isInt) {
$val = [long]$val
} elseif ($colType -eq 'VARCHAR' -and ($val -is [long] -or $val -is [double])) {
$val = [string]$val
}
}
# Inlined ConvertTo-DuckDBValue
if ($null -eq $val) {
[void]$appenderRow.AppendValue([DBNull]::Value)
# AppendValue([DBNull]::Value) has wrong overload resolution on typed
# columns (e.g. resolves to AppendValue(bool) for DOUBLE). Use the
# dedicated AppendNullValue() method instead.
[void]$appenderRow.AppendNullValue()
} elseif (-not $SimpleTypesOnly -and (
$val -is [System.Collections.IList] -or
$val -is [PSCustomObject] -or
Expand Down
3 changes: 2 additions & 1 deletion SqlPipeline/SqlPipeline/SqlPipeline.psd1
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
RootModule = 'SqlPipeline.psm1'

# Die Versionsnummer dieses Moduls
ModuleVersion = '0.3.6'
ModuleVersion = '0.3.7'

# Unterstützte PSEditions
# CompatiblePSEditions = @()
Expand Down Expand Up @@ -126,6 +126,7 @@ PrivateData = @{

# 'ReleaseNotes' des Moduls
ReleaseNotes = '
0.3.7 DuckDB: multi-row type inference & appender fixes with numeric and boolean types
0.3.6 Adding functionality to count updates and inserts when executing the MERGE
0.3.5 Added function to show open DuckDB connections: Show-DuckDBConnection
0.3.4 Fixing package installation with PowerShell 5.1 because Expand-Archive only supports *.zip files
Expand Down
165 changes: 165 additions & 0 deletions SqlPipeline/Tests/SqlPipeline_DuckDB.Tests.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -431,3 +431,168 @@ Describe "Export-DuckDBToParquet" -Skip:(-not $script:duckDBAvailable) {
}

}


Describe "Get-DuckDBBestType - multi-row type inference" -Skip:(-not $script:duckDBAvailable) {
    # Get-DuckDBBestType is private, so every expectation is asserted
    # indirectly: rows go in through Add-RowsToDuckDB and the resulting
    # DuckDB column type is read back via DESCRIBE.

    AfterEach {
        $tables = @(
            "typ_int","typ_double","typ_mixed","typ_null","typ_bool_int",
            "typ_bool_double","typ_incompat"
        )
        foreach ($table in $tables) {
            Invoke-DuckDBQuery -Query "DROP TABLE IF EXISTS $table" -ErrorAction SilentlyContinue
        }
    }

    It "Creates BIGINT column when all sampled rows are integer" {
        $sample = foreach ($i in 1..20) { [PSCustomObject]@{ Val = [int]$i } }
        $sample | Add-RowsToDuckDB -TableName "typ_int"

        $schema = Get-DuckDBData -Query "DESCRIBE typ_int"
        $valCol = $schema.Rows |
            Where-Object { $_["column_name"] -eq "Val" } |
            Select-Object -First 1
        $valCol["column_type"] | Should -Be "BIGINT"
    }

    It "Creates DOUBLE column when all sampled rows are double" {
        $sample = foreach ($i in 1..20) { [PSCustomObject]@{ Val = [double]($i + 0.1) } }
        $sample | Add-RowsToDuckDB -TableName "typ_double"

        $schema = Get-DuckDBData -Query "DESCRIBE typ_double"
        $valCol = $schema.Rows |
            Where-Object { $_["column_name"] -eq "Val" } |
            Select-Object -First 1
        $valCol["column_type"] | Should -Be "DOUBLE"
    }

    It "Widens to DOUBLE when first rows are int but later rows are double" {
        # Single-row detection would have created BIGINT from row 1;
        # multi-row sampling must widen the column to DOUBLE instead.
        $sample = @()
        $sample += foreach ($i in 1..10) { [PSCustomObject]@{ Val = [int]$i } }
        $sample += foreach ($i in 11..15) { [PSCustomObject]@{ Val = [double]($i + 0.5) } }
        $sample | Add-RowsToDuckDB -TableName "typ_mixed"

        $schema = Get-DuckDBData -Query "DESCRIBE typ_mixed"
        $valCol = $schema.Rows |
            Where-Object { $_["column_name"] -eq "Val" } |
            Select-Object -First 1
        $valCol["column_type"] | Should -Be "DOUBLE"
    }

    It "Skips null values and still infers DOUBLE from the non-null rows" {
        $sample = @()
        $sample += foreach ($i in 1..5) { [PSCustomObject]@{ Val = $null } }
        $sample += foreach ($i in 6..15) { [PSCustomObject]@{ Val = [double]($i * 1.5) } }
        $sample | Add-RowsToDuckDB -TableName "typ_null"

        $schema = Get-DuckDBData -Query "DESCRIBE typ_null"
        $valCol = $schema.Rows |
            Where-Object { $_["column_name"] -eq "Val" } |
            Select-Object -First 1
        $valCol["column_type"] | Should -Be "DOUBLE"
    }

    It "Widens BOOLEAN+int to BIGINT" {
        $sample = @(
            [PSCustomObject]@{ Flag = $true }
            [PSCustomObject]@{ Flag = $false }
            [PSCustomObject]@{ Flag = [int]42 }
        )
        $sample | Add-RowsToDuckDB -TableName "typ_bool_int"

        $schema = Get-DuckDBData -Query "DESCRIBE typ_bool_int"
        $flagCol = $schema.Rows |
            Where-Object { $_["column_name"] -eq "Flag" } |
            Select-Object -First 1
        $flagCol["column_type"] | Should -Be "BIGINT"
    }

    It "Widens BOOLEAN+double to DOUBLE" {
        $sample = @(
            [PSCustomObject]@{ Flag = $true }
            [PSCustomObject]@{ Flag = [double]3.14 }
        )
        $sample | Add-RowsToDuckDB -TableName "typ_bool_double"

        $schema = Get-DuckDBData -Query "DESCRIBE typ_bool_double"
        $flagCol = $schema.Rows |
            Where-Object { $_["column_name"] -eq "Flag" } |
            Select-Object -First 1
        $flagCol["column_type"] | Should -Be "DOUBLE"
    }

    It "Falls back to VARCHAR for incompatible types (string + int)" {
        $sample = @(
            [PSCustomObject]@{ Val = "hello" }
            [PSCustomObject]@{ Val = [int]42 }
        )
        $sample | Add-RowsToDuckDB -TableName "typ_incompat"

        $schema = Get-DuckDBData -Query "DESCRIBE typ_incompat"
        $valCol = $schema.Rows |
            Where-Object { $_["column_name"] -eq "Val" } |
            Select-Object -First 1
        $valCol["column_type"] | Should -Be "VARCHAR"
    }

}


Describe "Write-DuckDBAppender - numeric type correctness (byte-reinterpretation fix)" -Skip:(-not $script:duckDBAvailable) {
    # Before the fix, DuckDB.NET's AppendValue(Int64) on a DOUBLE column reinterpreted
    # the 8 raw bytes of the long as a double, turning 15 into ~7.4e-323.

    AfterEach {
        "apr_int_in_double","apr_many_mixed","apr_double_in_bigint" | ForEach-Object {
            Invoke-DuckDBQuery -Query "DROP TABLE IF EXISTS $_" -ErrorAction SilentlyContinue
        }
    }

    It "Stores integer value correctly in a DOUBLE column (not as ~7.4e-323)" {
        # Multi-row sampling sees both rows, so the column is created as DOUBLE;
        # the appender must then cast the int to double instead of reinterpreting it.
        $rows = @(
            [PSCustomObject]@{ Val = [double]1.5 }
            [PSCustomObject]@{ Val = [int]15 }
        )
        $rows | Add-RowsToDuckDB -TableName "apr_int_in_double"

        $result = Get-DuckDBData -Query "SELECT Val FROM apr_int_in_double ORDER BY Val"
        [double]$result.Rows[0]["Val"] | Should -Be 1.5
        [double]$result.Rows[1]["Val"] | Should -Be 15.0
    }

    It "Integer 15 in a DOUBLE column is greater than 10 (not a subnormal ~7.4e-323)" {
        # A byte-reinterpreted 15 (~7.4e-323) would not satisfy Val > 10,
        # so the WHERE clause catches the regression directly in SQL.
        $rows = @(
            [PSCustomObject]@{ Val = [double]1.0 }
            [PSCustomObject]@{ Val = [int]15 }
        )
        $rows | Add-RowsToDuckDB -TableName "apr_int_in_double"

        $result = Get-DuckDBData -Query "SELECT Val FROM apr_int_in_double WHERE Val > 10"
        $result.Rows.Count | Should -Be 1
        [double]$result.Rows[0]["Val"] | Should -Be 15.0
    }

    It "All mixed int/double values are stored correctly in a DOUBLE column" {
        # Multi-row sampling widens the column to DOUBLE from the start,
        # then the appender must still cast each int correctly.
        $rows = @(
            1..5 | ForEach-Object { [PSCustomObject]@{ Score = [double]($_ * 1.5) } } # 1.5 3.0 4.5 6.0 7.5
            6..10 | ForEach-Object { [PSCustomObject]@{ Score = [int]($_ * 10) } } # 60 70 80 90 100
        )
        $rows | Add-RowsToDuckDB -TableName "apr_many_mixed"

        $result = Get-DuckDBData -Query "SELECT Score FROM apr_many_mixed ORDER BY Score"
        $result.Rows.Count | Should -Be 10

        # All stored values must be sensible positive numbers (rules out subnormal garbage)
        foreach ($row in $result.Rows) {
            [double]$row["Score"] | Should -BeGreaterThan 0
            [double]$row["Score"] | Should -BeLessOrEqual 100
        }
    }

    It "Stores double value correctly in a BIGINT column" {
        # The 100-row sampling window in Invoke-BufferedWrite would widen a
        # mixed int/double sample to DOUBLE, so relying on row 1 alone can no
        # longer produce a BIGINT column. Create the table explicitly to
        # guarantee a BIGINT column; the appender must then truncate the
        # double 3.0 to the integer 3 instead of reinterpreting its bytes.
        Invoke-DuckDBQuery -Query "CREATE TABLE apr_double_in_bigint (Val BIGINT)"
        $rows = @(
            [PSCustomObject]@{ Val = [int]10 }
            [PSCustomObject]@{ Val = [double]3.0 }
        )
        $rows | Add-RowsToDuckDB -TableName "apr_double_in_bigint"

        $result = Get-DuckDBData -Query "SELECT Val FROM apr_double_in_bigint ORDER BY Val"
        $result.Rows.Count | Should -Be 2
        [long]$result.Rows[0]["Val"] | Should -Be 3
        [long]$result.Rows[1]["Val"] | Should -Be 10
    }

}