diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml index 264286296008..11a1c5585886 100644 --- a/.github/workflows/linux-build.yml +++ b/.github/workflows/linux-build.yml @@ -65,11 +65,11 @@ jobs: # TODO: Install a newer cmake here until we update the images upstream pip install cmake==3.30.4 - - uses: assignUser/stash/restore@v1 - with: - token: '${{ secrets.ARTIFACT_CACHE_TOKEN }}' - path: '${{ env.CCACHE_DIR }}' - key: ccache-linux-adapters + # - uses: assignUser/stash/restore@v1 + # with: + # token: '${{ secrets.ARTIFACT_CACHE_TOKEN }}' + # path: '${{ env.CCACHE_DIR }}' + # key: ccache-linux-adapters - name: "Zero Ccache Statistics" run: | diff --git a/velox/experimental/cudf/exec/CudfConversion.cpp b/velox/experimental/cudf/exec/CudfConversion.cpp index 404125842f89..9c2ff092cef3 100644 --- a/velox/experimental/cudf/exec/CudfConversion.cpp +++ b/velox/experimental/cudf/exec/CudfConversion.cpp @@ -119,7 +119,12 @@ RowVectorPtr CudfFromVelox::getOutput() { // Combine selected RowVectors into a single RowVector auto input = mergeRowVectors(selectedInputs, inputs_[0]->pool()); - + // print physical type of each column in output + for (auto i = 0; i < input->type()->size(); i++) { + std::cout << "input column " << i + << " type: " << input->childAt(i)->type()->toString() << " " + << input->childAt(i)->type()->kindName() << std::endl; + } // Remove processed inputs inputs_.erase(inputs_.begin(), inputs_.begin() + selectedInputs.size()); currentOutputSize_ -= totalSize; @@ -134,6 +139,12 @@ RowVectorPtr CudfFromVelox::getOutput() { // Convert RowVector to cudf table auto tbl = with_arrow::toCudfTable(input, input->pool(), stream); + // print types of tbl->view() + for (auto i = 0; i < tbl->num_columns(); i++) { + std::cout << "input cudf column " << i + << " type: " << static_cast(tbl->get_column(i).type().id()) + << std::endl; + } stream.synchronize(); @@ -206,8 +217,29 @@ RowVectorPtr CudfToVelox::getOutput() { } RowVectorPtr output = with_arrow::toVeloxColumn(tbl->view(), pool(), "", stream); + + // print types of tbl->view() + for (auto i = 0; i < tbl->num_columns(); i++) { + std::cout << "cudf column " << i + << " type: " << static_cast(tbl->get_column(i).type().id()) + << std::endl; + } stream.synchronize(); finished_ = noMoreInput_ && inputs_.empty(); + std::cout << "output.type: " << output->type()->toString() << std::endl; + std::cout << "outputType_: " << outputType_->toString() << std::endl; + // print physical type of each column in output + for (auto i = 0; i < output->type()->size(); i++) { + std::cout << "column " << i + << " type: " << output->childAt(i)->type()->toString() << " " + << output->childAt(i)->type()->kindName() << std::endl; + } + // print physical type of each column in outputType_ + for (auto i = 0; i < outputType_->size(); i++) { + std::cout << "column " << i + << " type: " << outputType_->childAt(i)->toString() << " " + << outputType_->childAt(i)->kindName() << std::endl; + } output->setType(outputType_); return output; } diff --git a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp index 34887f206443..a43397de52b2 100644 --- a/velox/experimental/cudf/exec/VeloxCudfInterop.cpp +++ b/velox/experimental/cudf/exec/VeloxCudfInterop.cpp @@ -84,6 +84,34 @@ cudf::type_id velox_to_cudf_type_id(const TypePtr& type) { } namespace with_arrow { +// Print ArrowSchema format strings recursively +void printArrowSchemaFormat(const ArrowSchema& arrowSchema, int depth = 0) { + std::string indent(depth * 2, ' '); + if (depth == 0) { + std::cout << "arrowSchema.format: " << arrowSchema.format << std::endl; + } + + for (int64_t i = 0; i < arrowSchema.n_children; ++i) { + const ArrowSchema* child = arrowSchema.children[i]; + if (child != nullptr) { + std::cout << indent << " child[" << i << "].format: " << child->format + << std::endl; + if (child->n_children > 0) { + printArrowSchemaFormat(*child, depth + 1); + } + } + } + + if (arrowSchema.dictionary != nullptr) { + std::cout << indent + << " dictionary.format: " << arrowSchema.dictionary->format + << std::endl; + if (arrowSchema.dictionary->n_children > 0) { + printArrowSchemaFormat(*arrowSchema.dictionary, depth + 1); + } + } +} + std::unique_ptr toCudfTable( const facebook::velox::RowVectorPtr& veloxTable, facebook::velox::memory::MemoryPool* pool, @@ -102,6 +130,7 @@ std::unique_ptr toCudfTable( std::dynamic_pointer_cast(veloxTable), arrowSchema, arrowOptions); + printArrowSchemaFormat(arrowSchema); auto tbl = cudf::from_arrow(&arrowSchema, &arrowArray, stream); // Release Arrow resources diff --git a/velox/experimental/cudf/tests/OrderByTest.cpp b/velox/experimental/cudf/tests/OrderByTest.cpp index 334671305c43..172803d27851 100644 --- a/velox/experimental/cudf/tests/OrderByTest.cpp +++ b/velox/experimental/cudf/tests/OrderByTest.cpp @@ -412,4 +412,310 @@ TEST_F(OrderByTest, outputBatchRows) { } #endif +TEST_F(OrderByTest, allTypesWithIntegerKey) { + vector_size_t batchSize = 500; + std::vector vectors; + + // Define the types we'll use + auto keyType = INTEGER(); + auto boolType = BOOLEAN(); + auto tinyintType = TINYINT(); + auto smallintType = SMALLINT(); + auto bigintType = BIGINT(); + auto realType = REAL(); + auto doubleType = DOUBLE(); + auto hugeIntType = HUGEINT(); // Note: not supported by ArrowSchema + auto varcharType = VARCHAR(); + auto varbinaryType = VARBINARY(); // Unsupported + auto timestampType = TIMESTAMP(); + auto arrayType = ARRAY(INTEGER()); + auto rowType = ROW({{"nested1", BOOLEAN()}, {"nested2", INTEGER()}}); + auto shortDecimalType = + DECIMAL(10, 2); // Unsupported due to a bug in ArrowSchema in Velox + auto longDecimalType = DECIMAL(38, 10); + auto dateType = DATE(); + auto intervalDayTimeType = INTERVAL_DAY_TIME(); + auto intervalYearMonthType = INTERVAL_YEAR_MONTH(); // Unsupported + // MAP, VARBINARY, UKNOWN, FUNCTION, OPAQUE // Unsupported + + for (int32_t i = 0; i < 3; ++i) { + std::vector children; + std::vector names; + + // Integer key column for ordering + auto c0 = makeFlatVector( + batchSize, + [&](vector_size_t row) { return batchSize * i + row; }, + nullEvery(7), + keyType); + children.push_back(c0); + names.push_back("c0"); + + // BOOLEAN + auto c1 = makeFlatVector( + batchSize, + [](vector_size_t row) { return row % 2 == 0; }, + nullEvery(5), + boolType); + children.push_back(c1); + names.push_back("c1"); + + // TINYINT + auto c2 = makeFlatVector( + batchSize, + [](vector_size_t row) { return row % 127; }, + nullEvery(9), + tinyintType); + children.push_back(c2); + names.push_back("c2"); + + // SMALLINT + auto c3 = makeFlatVector( + batchSize, + [](vector_size_t row) { return row % 32767; }, + nullEvery(11), + smallintType); + children.push_back(c3); + names.push_back("c3"); + + // BIGINT + auto c4 = makeFlatVector( + batchSize, + [](vector_size_t row) { return row * 10000; }, + nullEvery(13), + bigintType); + children.push_back(c4); + names.push_back("c4"); + + // HUGEINT Note:(not supported by ArrowSchema) + // auto c5 = makeFlatVector( + // batchSize, + // [](vector_size_t row) { return row * 1000000000000000000ull; }, + // nullEvery(17), + // hugeIntType); + // children.push_back(c5); + + // REAL (float) + auto c6 = makeFlatVector( + batchSize, + [](vector_size_t row) { return row * 0.1f; }, + nullEvery(15), + realType); + children.push_back(c6); + names.push_back("c6"); + + // DOUBLE + auto c7 = makeFlatVector( + batchSize, + [](vector_size_t row) { return row * 0.01; }, + nullEvery(17), + doubleType); + children.push_back(c7); + names.push_back("c7"); + + // VARCHAR + auto c8 = makeFlatVector( + batchSize, + [](vector_size_t row) { + return StringView::makeInline("str_" + std::to_string(row)); + }, + nullEvery(19), + varcharType); + children.push_back(c8); + names.push_back("c8"); + + // VARBINARY + // auto c9 = makeFlatVector( + // batchSize, + // [](vector_size_t row) { + // return StringView::makeInline("bin_" + std::to_string(row)); + // }, + // nullEvery(21), + // varbinaryType); + // children.push_back(c9); + // names.push_back("c9"); + + // TIMESTAMP + auto c10 = makeFlatVector( + batchSize, + [](vector_size_t row) { + // seconds, nanoseconds + return Timestamp(1600000000 + row, row * 1000); + }, + nullEvery(23), + timestampType); + children.push_back(c10); + names.push_back("c10"); + + // ARRAY(INTEGER()) + auto c11 = makeArrayVector( + batchSize, + [](vector_size_t row) { return row % 5 + 1; }, // array sizes + [](vector_size_t idx) { return idx * 2; }, // array contents + nullEvery(13)); + children.push_back(c11); + names.push_back("c11"); + + // ROW/STRUCT type + auto nestedBool = makeFlatVector( + batchSize, + [](vector_size_t row) { return row % 3 == 0; }, + nullEvery(11), + BOOLEAN()); + auto nestedInt = makeFlatVector( + batchSize, + [](vector_size_t row) { return row * 5; }, + nullEvery(13), + INTEGER()); + auto c12 = makeRowVector({"nested1", "nested2"}, {nestedBool, nestedInt}); + children.push_back(c12); + names.push_back("c12"); + + // DECIMAL(10, 2) - short decimal + // auto c13 = makeFlatVector( + // batchSize, + // [](vector_size_t row) { return row * 123; }, // Will be interpreted + // as row*1.23 nullEvery(27), shortDecimalType); + // children.push_back(c13); + // names.push_back("c13"); + + // DECIMAL(38, 10) - long decimal + auto c14 = makeFlatVector( + batchSize, + [](vector_size_t row) { + return static_cast(row) * 1234567890; + }, + nullEvery(29), + longDecimalType); + children.push_back(c14); + names.push_back("c14"); + + // DATE + auto c15 = makeFlatVector( + batchSize, + [](vector_size_t row) { + // Days since epoch, starting from 2020-01-01 (18262) plus row offset + return 18262 + row % 1000; + }, + nullEvery(31), + dateType); + children.push_back(c15); + names.push_back("c15"); + + // INTERVAL_DAY_TIME - stored as int64_t + auto c16 = makeFlatVector( + batchSize, + [](vector_size_t row) { + // Interval of days and milliseconds: row days and row*100 + // milliseconds + return row * 86400000 + row * 100; + }, + nullEvery(33), + intervalDayTimeType); + children.push_back(c16); + names.push_back("c16"); + + // INTERVAL_YEAR_MONTH - stored as int32_t + // auto c17 = makeFlatVector( + // batchSize, + // [](vector_size_t row) { + // // Interval of months: row months + // return row % 120; // 0-120 months (0-10 years) + // }, + // nullEvery(35), + // intervalYearMonthType); + // children.push_back(c17); + // names.push_back("c17"); + + // Dictionary vector + auto flatVector = makeFlatVector( + batchSize, [](vector_size_t row) { return row; }); + auto indices = + makeIndices(batchSize, [](vector_size_t i) { return i % 100; }); + auto nulls = + makeNulls(batchSize, [](vector_size_t row) { return row % 3 == 0; }); + + auto c18 = + BaseVector::wrapInDictionary(nulls, indices, batchSize, flatVector); + // For comparison, create a dictionary without nulls + // auto dictWithoutNulls = BaseVector::wrapInDictionary(nullptr, indices, + // batchSize, flatVector); + children.push_back(c18); + names.push_back("c18"); + + vectors.push_back(makeRowVector(names, children)); + } + + createDuckDbTable(vectors); + + // Test ordering by the integer key + testSingleKey(vectors, "c0"); + + // Test with a filter + testSingleKey(vectors, "c0", "c0 % 100 = 0"); + + // Test descending order + core::PlanNodeId orderById; + auto plan = PlanBuilder() + .values(vectors) + .orderBy({"c0 DESC NULLS LAST"}, false) + .capturePlanNodeId(orderById) + .planNode(); + runTest( + plan, orderById, "SELECT * FROM tmp ORDER BY c0 DESC NULLS LAST", {0}); + + // Test with secondary key + testTwoKeys(vectors, "c0", "c1"); + + // Test ordering by boolean column + testSingleKey(vectors, "c1"); + + // Test ordering by tinyint column + testSingleKey(vectors, "c2"); + + // Test ordering by smallint column + testSingleKey(vectors, "c3"); + + // Test ordering by bigint column + testSingleKey(vectors, "c4"); + + // Test ordering by date column + testSingleKey(vectors, "c15"); + + // Test ordering by interval column + testSingleKey(vectors, "c16"); + + // Test ordering by dictionary column + testSingleKey(vectors, "c18"); + + // Test multiple ordering directions + core::PlanNodeId multiOrderById; + auto multiOrderPlan = + PlanBuilder() + .values(vectors) + .orderBy({"c0 ASC NULLS FIRST", "c1 DESC NULLS LAST"}, false) + .capturePlanNodeId(multiOrderById) + .planNode(); + runTest( + multiOrderPlan, + multiOrderById, + "SELECT * FROM tmp ORDER BY c0 ASC NULLS FIRST, c1 DESC NULLS LAST", + {0, 1}); + + // Test with complex filter + testSingleKey(vectors, "c0", "c1 = true AND c2 < 100"); + + // Test with three keys + core::PlanNodeId threeKeysOrderById; + auto threeKeysPlan = PlanBuilder() + .values(vectors) + .orderBy({"c0 ASC", "c1 DESC", "c2 ASC"}, false) + .capturePlanNodeId(threeKeysOrderById) + .planNode(); + runTest( + threeKeysPlan, + threeKeysOrderById, + "SELECT * FROM tmp ORDER BY c0 ASC, c1 DESC, c2 ASC", + {0, 1, 2}); +} } // namespace diff --git a/velox/type/Type.cpp b/velox/type/Type.cpp index a498dd1d4592..7ce6771f9474 100644 --- a/velox/type/Type.cpp +++ b/velox/type/Type.cpp @@ -119,6 +119,26 @@ std::string mapTypeKindToName(const TypeKind& typeKind) { return found->second; } +namespace { +template +static int32_t kindSize() { + return sizeof(typename KindToFlatVector::HashRowType); +} + +static int32_t typeKindSize(TypeKind kind) { + if (kind == TypeKind::UNKNOWN) { + return sizeof(UnknownValue); + } + + return VELOX_DYNAMIC_TYPE_DISPATCH(kindSize, kind); +} +} // namespace + +uint8_t getDecimalBitWidth(const Type& type) { + // Get the physical type's bit width + return typeKindSize(type.kind()) * 8; +} + std::pair getDecimalPrecisionScale(const Type& type) { if (type.isShortDecimal()) { const auto& decimalType = static_cast(type); diff --git a/velox/type/Type.h b/velox/type/Type.h index e95d7abd87d6..12a9355f38d2 100644 --- a/velox/type/Type.h +++ b/velox/type/Type.h @@ -845,6 +845,8 @@ FOLLY_ALWAYS_INLINE bool isDecimalName(const std::string& name) { return (name == "DECIMAL"); } +uint8_t getDecimalBitWidth(const Type& type); + std::pair getDecimalPrecisionScale(const Type& type); class UnknownType : public CanProvideCustomComparisonType { diff --git a/velox/vector/arrow/Bridge.cpp b/velox/vector/arrow/Bridge.cpp index 2c81819163c4..f8e3fe053311 100644 --- a/velox/vector/arrow/Bridge.cpp +++ b/velox/vector/arrow/Bridge.cpp @@ -26,6 +26,8 @@ #include "velox/vector/VectorTypeUtils.h" #include "velox/vector/arrow/Abi.h" +#include + namespace facebook::velox { namespace { @@ -265,7 +267,8 @@ const char* exportArrowFormatStr( if (type->isDecimal()) { // Decimal types encode the precision, scale values. const auto& [precision, scale] = getDecimalPrecisionScale(*type); - formatBuffer = fmt::format("d:{},{}", precision, scale); + auto const width = getDecimalBitWidth(*type); + formatBuffer = fmt::format("d:{},{},{}", precision, scale, width); return formatBuffer.c_str(); } @@ -1225,25 +1228,28 @@ TypePtr parseDecimalFormat(const char* format) { auto firstCommaIdx = formatStr.find(',', 2); auto secondCommaIdx = formatStr.find(',', firstCommaIdx + 1); - if (firstCommaIdx == std::string::npos || - formatStr.size() == firstCommaIdx + 1 || - (secondCommaIdx != std::string::npos && - formatStr.size() == secondCommaIdx + 1)) { - VELOX_USER_FAIL(invalidFormatMsg, format); - } + // if (firstCommaIdx == std::string::npos || + // formatStr.size() == firstCommaIdx + 1 || + // (secondCommaIdx != std::string::npos && + // formatStr.size() == secondCommaIdx + 1)) { + // VELOX_USER_FAIL(invalidFormatMsg, format); + // } // Parse "d:". int precision = std::stoi(&format[2], &sz); int scale = std::stoi(&format[firstCommaIdx + 1], &sz); // If bitwidth is provided, check if it is equal to 128. - if (secondCommaIdx != std::string::npos) { - int bitWidth = std::stoi(&format[secondCommaIdx + 1], &sz); - VELOX_USER_CHECK_EQ( - bitWidth, - 128, - "Conversion failed for '{}'. Velox decimal does not support custom bitwidth.", - format); - } + // if (secondCommaIdx != std::string::npos) { + // int bitWidth = std::stoi(&format[secondCommaIdx + 1], &sz); + // VELOX_USER_CHECK_EQ( + // bitWidth, + // 128, + // "Conversion failed for '{}'. Velox decimal does not support custom + // bitwidth.", format); + // } + + std::cout << "pformat: " << format << std::endl; + std::cout << "precision: " << precision << " scale: " << scale << std::endl; return DECIMAL(precision, scale); } catch (std::invalid_argument&) { VELOX_USER_FAIL(invalidFormatMsg, format); @@ -1255,6 +1261,8 @@ TypePtr importFromArrowImpl( const ArrowSchema& arrowSchema) { VELOX_CHECK_NOT_NULL(format); + std::cout << "format: " << format << std::endl; + switch (format[0]) { case 'b': return BOOLEAN();