From 8b8ce6bbf813c8b51ba5e38481bae9db20ca955c Mon Sep 17 00:00:00 2001 From: Rui Mo Date: Tue, 25 Mar 2025 10:07:04 +0000 Subject: [PATCH 01/10] [5962] Support struct schema evolution matching by name --- velox/connectors/hive/SplitReader.cpp | 14 +- velox/dwio/common/ScanSpec.cpp | 2 +- .../common/SelectiveStructColumnReader.cpp | 11 +- .../dwio/common/SelectiveStructColumnReader.h | 5 + velox/dwio/dwrf/reader/DwrfReader.cpp | 13 +- .../dwio/dwrf/reader/SelectiveDwrfReader.cpp | 9 +- velox/dwio/dwrf/reader/SelectiveDwrfReader.h | 5 +- .../reader/SelectiveFlatMapColumnReader.cpp | 62 +++++--- .../reader/SelectiveFlatMapColumnReader.h | 3 +- .../reader/SelectiveRepeatedColumnReader.cpp | 18 ++- .../reader/SelectiveRepeatedColumnReader.h | 6 +- .../reader/SelectiveStructColumnReader.cpp | 8 +- .../dwrf/reader/SelectiveStructColumnReader.h | 3 + velox/dwio/dwrf/test/TestColumnReader.cpp | 1 + .../parquet/reader/ParquetColumnReader.cpp | 10 +- .../dwio/parquet/reader/ParquetColumnReader.h | 4 +- velox/dwio/parquet/reader/ParquetReader.cpp | 8 +- .../parquet/reader/RepeatedColumnReader.cpp | 32 ++++- .../parquet/reader/RepeatedColumnReader.h | 8 +- .../parquet/reader/StructColumnReader.cpp | 51 ++++++- .../dwio/parquet/reader/StructColumnReader.h | 4 +- .../tests/reader/ParquetTableScanTest.cpp | 132 ++++++++++++++++++ velox/exec/tests/TableScanTest.cpp | 86 ++++++++++++ 23 files changed, 430 insertions(+), 65 deletions(-) diff --git a/velox/connectors/hive/SplitReader.cpp b/velox/connectors/hive/SplitReader.cpp index c3d8da10be1f..11cc8322dd5a 100644 --- a/velox/connectors/hive/SplitReader.cpp +++ b/velox/connectors/hive/SplitReader.cpp @@ -370,11 +370,17 @@ std::vector SplitReader::adaptColumns( auto fileTypeIdx = fileType->getChildIdxIfExists(fieldName); if (!fileTypeIdx.has_value()) { // Column is missing. Most likely due to schema evolution. - VELOX_CHECK(tableSchema, "Unable to resolve column '{}'", fieldName); + auto outputTypeIdx = readerOutputType_->getChildIdxIfExists(fieldName); + TypePtr fieldType; + if (outputTypeIdx.has_value()) { + // Field name exists in the user-specified output type. + fieldType = readerOutputType_->childAt(outputTypeIdx.value()); + } else { + VELOX_CHECK(tableSchema, "Unable to resolve column '{}'", fieldName); + fieldType = tableSchema->findChild(fieldName); + } childSpec->setConstantValue(BaseVector::createNullConstant( - tableSchema->findChild(fieldName), - 1, - connectorQueryCtx_->memoryPool())); + fieldType, 1, connectorQueryCtx_->memoryPool())); } else { // Column no longer missing, reset constant value set on the spec. 
childSpec->setConstantValue(nullptr); diff --git a/velox/dwio/common/ScanSpec.cpp b/velox/dwio/common/ScanSpec.cpp index fc247ba4dd2a..ee93fd0bd102 100644 --- a/velox/dwio/common/ScanSpec.cpp +++ b/velox/dwio/common/ScanSpec.cpp @@ -142,7 +142,7 @@ bool ScanSpec::hasFilter() const { if (hasFilter_.has_value()) { return hasFilter_.value(); } - if (!isConstant() && filter()) { + if (filter()) { hasFilter_ = true; return true; } diff --git a/velox/dwio/common/SelectiveStructColumnReader.cpp b/velox/dwio/common/SelectiveStructColumnReader.cpp index 4401e73977bb..d286fa27e38b 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.cpp +++ b/velox/dwio/common/SelectiveStructColumnReader.cpp @@ -359,7 +359,6 @@ void SelectiveStructColumnReaderBase::read( } const auto& childSpecs = scanSpec_->children(); - VELOX_CHECK(!childSpecs.empty()); for (size_t i = 0; i < childSpecs.size(); ++i) { const auto& childSpec = childSpecs[i]; VELOX_TRACE_HISTORY_PUSH("read %s", childSpec->fieldName().c_str()); @@ -462,15 +461,17 @@ bool SelectiveStructColumnReaderBase::isChildMissing( // row type that doesn't exist // in the output. fileType_->type()->kind() != - TypeKind::MAP && // If this is the case it means this is a flat map, - // so it can't have "missing" fields. - childSpec.channel() >= fileType_->size()); + TypeKind::MAP // If this is the case it means this is a flat map, + // so it can't have "missing" fields. + ) && + (useColumnNames_ + ? !asRowType(fileType_->type())->containsChild(childSpec.fieldName()) + : childSpec.channel() >= fileType_->size()); } void SelectiveStructColumnReaderBase::getValues( const RowSet& rows, VectorPtr* result) { - VELOX_CHECK(!scanSpec_->children().empty()); VELOX_CHECK_NOT_NULL( *result, "SelectiveStructColumnReaderBase expects a non-null result"); VELOX_CHECK( diff --git a/velox/dwio/common/SelectiveStructColumnReader.h b/velox/dwio/common/SelectiveStructColumnReader.h index 8f258bc41da5..7d2a7ac6eb55 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.h +++ b/velox/dwio/common/SelectiveStructColumnReader.h @@ -115,11 +115,13 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { const std::shared_ptr& fileType, FormatParams& params, velox::common::ScanSpec& scanSpec, + bool useColumnNames, bool isRoot = false) : SelectiveColumnReader(requestedType, fileType, params, scanSpec), debugString_( getExceptionContext().message(VeloxException::Type::kSystem)), isRoot_(isRoot), + useColumnNames_(useColumnNames), rows_(memoryPool_) {} bool hasDeletion() const final { @@ -164,6 +166,9 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { // table. const bool isRoot_; + // Whether to use names for mapping table field names to file field names. + const bool useColumnNames_; + // Dense set of rows to read in next(). 
raw_vector rows_; diff --git a/velox/dwio/dwrf/reader/DwrfReader.cpp b/velox/dwio/dwrf/reader/DwrfReader.cpp index 31d402d06b10..72fb486888a4 100644 --- a/velox/dwio/dwrf/reader/DwrfReader.cpp +++ b/velox/dwio/dwrf/reader/DwrfReader.cpp @@ -44,7 +44,8 @@ class DwrfUnit : public LoadUnit { uint32_t stripeIndex, std::shared_ptr columnSelector, const std::shared_ptr& projectedNodes, - RowReaderOptions options) + RowReaderOptions options, + bool useColumnNames) : stripeReaderBase_{stripeReaderBase}, strideIndexProvider_{strideIndexProvider}, columnReaderStatistics_{&columnReaderStatistics}, @@ -53,7 +54,8 @@ class DwrfUnit : public LoadUnit { projectedNodes_{projectedNodes}, options_{std::move(options)}, stripeInfo_{ - stripeReaderBase.getReader().footer().stripes(stripeIndex_)} {} + stripeReaderBase.getReader().footer().stripes(stripeIndex_)}, + useColumnNames_{useColumnNames} {} ~DwrfUnit() override = default; @@ -92,6 +94,9 @@ class DwrfUnit : public LoadUnit { const RowReaderOptions options_; const StripeInformationWrapper stripeInfo_; + // Whether to use names for mapping table field names to file field names. + const bool useColumnNames_; + // Mutables bool preloaded_; std::optional cachedIoSize_; @@ -166,6 +171,7 @@ void DwrfUnit::ensureDecoders() { streamLabels, *columnReaderStatistics_, scanSpec, + useColumnNames_, flatMapContext, /*isRoot=*/true); selectiveColumnReader_->setIsTopLevel(); @@ -328,7 +334,8 @@ std::unique_ptr DwrfRowReader::getUnitLoader() { stripe, columnSelector_, projectedNodes_, - options_)); + options_, + readerBaseShared()->readerOptions().useColumnNamesForColumnMapping())); } std::shared_ptr unitLoaderFactory = options_.unitLoaderFactory(); diff --git a/velox/dwio/dwrf/reader/SelectiveDwrfReader.cpp b/velox/dwio/dwrf/reader/SelectiveDwrfReader.cpp index 7826887a943a..3306d7917ac5 100644 --- a/velox/dwio/dwrf/reader/SelectiveDwrfReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveDwrfReader.cpp @@ -64,6 +64,7 @@ std::unique_ptr SelectiveDwrfReader::build( const std::shared_ptr& fileType, DwrfParams& params, common::ScanSpec& scanSpec, + bool useColumnNames, bool isRoot) { VELOX_CHECK( !isRoot || fileType->type()->kind() == TypeKind::ROW, @@ -90,16 +91,16 @@ std::unique_ptr SelectiveDwrfReader::build( requestedType, fileType, params, SHORT_BYTE_SIZE, scanSpec); case TypeKind::ARRAY: return std::make_unique( - requestedType, fileType, params, scanSpec); + requestedType, fileType, params, scanSpec, useColumnNames); case TypeKind::MAP: if (stripe.format() == DwrfFormat::kDwrf && stripe.getEncoding(ek).kind() == proto::ColumnEncoding_Kind_MAP_FLAT) { return createSelectiveFlatMapColumnReader( - requestedType, fileType, params, scanSpec); + requestedType, fileType, params, scanSpec, useColumnNames); } return std::make_unique( - requestedType, fileType, params, scanSpec); + requestedType, fileType, params, scanSpec, useColumnNames); case TypeKind::REAL: if (requestedType->kind() == TypeKind::REAL) { return std::make_unique< @@ -116,7 +117,7 @@ std::unique_ptr SelectiveDwrfReader::build( requestedType, fileType, params, scanSpec); case TypeKind::ROW: return std::make_unique( - requestedType, fileType, params, scanSpec, isRoot); + requestedType, fileType, params, scanSpec, useColumnNames, isRoot); case TypeKind::BOOLEAN: return std::make_unique( requestedType, fileType, params, scanSpec, true); diff --git a/velox/dwio/dwrf/reader/SelectiveDwrfReader.h b/velox/dwio/dwrf/reader/SelectiveDwrfReader.h index 8baa6644df68..a787a6a10e63 100644 --- 
a/velox/dwio/dwrf/reader/SelectiveDwrfReader.h +++ b/velox/dwio/dwrf/reader/SelectiveDwrfReader.h @@ -30,6 +30,7 @@ class SelectiveDwrfReader { const std::shared_ptr& fileType, DwrfParams& params, common::ScanSpec& scanSpec, + bool useColumnNames, bool isRoot = false); /// Compatibility wrapper for tests. Takes the components of DwrfParams as @@ -41,10 +42,12 @@ class SelectiveDwrfReader { const StreamLabels& streamLabels, dwio::common::ColumnReaderStatistics& stats, common::ScanSpec* scanSpec, + bool useColumnNames, FlatMapContext flatMapContext = {}, bool isRoot = false) { auto params = DwrfParams(stripe, streamLabels, stats, flatMapContext); - return build(requestedType, fileType, params, *scanSpec, isRoot); + return build( + requestedType, fileType, params, *scanSpec, useColumnNames, isRoot); } }; diff --git a/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp index 89ac39ec9d80..8afbf64e90b4 100644 --- a/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp @@ -71,7 +71,8 @@ std::vector> getKeyNodes( const std::shared_ptr& fileType, DwrfParams& params, common::ScanSpec& scanSpec, - bool asStruct) { + bool asStruct, + bool useColumnNames) { using namespace dwio::common::flatmap; std::vector> keyNodes; @@ -145,7 +146,11 @@ std::vector> getKeyNodes( .inMapDecoder = inMapDecoder.get(), .keySelectionCallback = nullptr}); auto reader = SelectiveDwrfReader::build( - requestedValueType, dataValueType, childParams, *childSpec); + requestedValueType, + dataValueType, + childParams, + *childSpec, + useColumnNames); keyNodes.emplace_back( key, sequence, std::move(reader), std::move(inMapDecoder)); }); @@ -169,14 +174,21 @@ class SelectiveFlatMapAsStructReader : public SelectiveStructColumnReaderBase { const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec) + common::ScanSpec& scanSpec, + bool useColumnNames) : SelectiveStructColumnReaderBase( requestedType, fileType, params, - scanSpec), - keyNodes_( - getKeyNodes(requestedType, fileType, params, scanSpec, true)) { + scanSpec, + useColumnNames), + keyNodes_(getKeyNodes( + requestedType, + fileType, + params, + scanSpec, + true, + useColumnNames)) { VELOX_CHECK( !keyNodes_.empty(), "For struct encoding, keys to project must be configured"); @@ -201,15 +213,23 @@ class SelectiveFlatMapReader : public SelectiveStructColumnReaderBase { const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec) + common::ScanSpec& scanSpec, + bool useColumnNames) : SelectiveStructColumnReaderBase( requestedType, fileType, params, - scanSpec), + scanSpec, + useColumnNames), flatMap_( *this, - getKeyNodes(requestedType, fileType, params, scanSpec, false)) {} + getKeyNodes( + requestedType, + fileType, + params, + scanSpec, + false, + useColumnNames)) {} void read(int64_t offset, const RowSet& rows, const uint64_t* incomingNulls) override { @@ -230,13 +250,14 @@ std::unique_ptr createReader( const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec) { + common::ScanSpec& scanSpec, + bool useColumnNames) { if (scanSpec.isFlatMapAsStruct()) { return std::make_unique>( - requestedType, fileType, params, scanSpec); + requestedType, fileType, params, scanSpec, useColumnNames); } else { return std::make_unique>( - requestedType, fileType, params, scanSpec); + requestedType, fileType, 
params, scanSpec, useColumnNames); } } @@ -247,21 +268,26 @@ createSelectiveFlatMapColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec) { + common::ScanSpec& scanSpec, + bool useColumnNames) { auto kind = fileType->childAt(0)->type()->kind(); switch (kind) { case TypeKind::TINYINT: - return createReader(requestedType, fileType, params, scanSpec); + return createReader( + requestedType, fileType, params, scanSpec, useColumnNames); case TypeKind::SMALLINT: - return createReader(requestedType, fileType, params, scanSpec); + return createReader( + requestedType, fileType, params, scanSpec, useColumnNames); case TypeKind::INTEGER: - return createReader(requestedType, fileType, params, scanSpec); + return createReader( + requestedType, fileType, params, scanSpec, useColumnNames); case TypeKind::BIGINT: - return createReader(requestedType, fileType, params, scanSpec); + return createReader( + requestedType, fileType, params, scanSpec, useColumnNames); case TypeKind::VARBINARY: case TypeKind::VARCHAR: return createReader( - requestedType, fileType, params, scanSpec); + requestedType, fileType, params, scanSpec, useColumnNames); default: VELOX_UNSUPPORTED("Not supported key type: {}", kind); } diff --git a/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.h b/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.h index 9a3c3898927d..e12936d16510 100644 --- a/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.h @@ -26,6 +26,7 @@ createSelectiveFlatMapColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams&, - common::ScanSpec&); + common::ScanSpec&, + bool useColumnNames); } // namespace facebook::velox::dwrf diff --git a/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.cpp index 8d2c0563f026..06e2a975393e 100644 --- a/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.cpp @@ -52,7 +52,8 @@ SelectiveListColumnReader::SelectiveListColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec) + common::ScanSpec& scanSpec, + bool useColumnNames) : dwio::common::SelectiveListColumnReader( requestedType, fileType, @@ -75,7 +76,11 @@ SelectiveListColumnReader::SelectiveListColumnReader( params.runtimeStatistics(), flatMapContextFromEncodingKey(encodingKey)); child_ = SelectiveDwrfReader::build( - childType, fileType_->childAt(0), childParams, *scanSpec_->children()[0]); + childType, + fileType_->childAt(0), + childParams, + *scanSpec_->children()[0], + useColumnNames); children_ = {child_.get()}; } @@ -83,7 +88,8 @@ SelectiveMapColumnReader::SelectiveMapColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec) + common::ScanSpec& scanSpec, + bool useColumnNames) : dwio::common::SelectiveMapColumnReader( requestedType, fileType, @@ -111,7 +117,8 @@ SelectiveMapColumnReader::SelectiveMapColumnReader( keyType, fileType_->childAt(0), keyParams, - *scanSpec_->children()[0].get()); + *scanSpec_->children()[0].get(), + useColumnNames); auto& valueType = requestedType_->childAt(1); auto elementParams = DwrfParams( @@ -123,7 +130,8 @@ SelectiveMapColumnReader::SelectiveMapColumnReader( valueType, fileType_->childAt(1), elementParams, - *scanSpec_->children()[1]); + *scanSpec_->children()[1], 
+ useColumnNames); children_ = {keyReader_.get(), elementReader_.get()}; } diff --git a/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.h b/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.h index 9d89613009d1..7149190ae54c 100644 --- a/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.h @@ -31,7 +31,8 @@ class SelectiveListColumnReader const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec); + common::ScanSpec& scanSpec, + bool useColumnNames); void resetFilterCaches() override { child_->resetFilterCaches(); @@ -64,7 +65,8 @@ class SelectiveMapColumnReader : public dwio::common::SelectiveMapColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec); + common::ScanSpec& scanSpec, + bool useColumnNames); void resetFilterCaches() override { keyReader_->resetFilterCaches(); diff --git a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp index 418c8254c8b6..03874680e983 100644 --- a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp @@ -28,12 +28,14 @@ SelectiveStructColumnReader::SelectiveStructColumnReader( const std::shared_ptr& fileType, DwrfParams& params, common::ScanSpec& scanSpec, + bool useColumnNames, bool isRoot) : SelectiveStructColumnReaderBase( requestedType, fileType, params, scanSpec, + useColumnNames, isRoot) { EncodingKey encodingKey{fileType_->id(), params.flatMapContext().sequence}; auto& stripe = params.stripeStreams(); @@ -86,7 +88,11 @@ SelectiveStructColumnReader::SelectiveStructColumnReader( .inMapDecoder = nullptr, .keySelectionCallback = nullptr}); addChild(SelectiveDwrfReader::build( - childRequestedType, childFileType, childParams, *childSpec)); + childRequestedType, + childFileType, + childParams, + *childSpec, + useColumnNames)); childSpec->setSubscript(children_.size() - 1); } } diff --git a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h index 59dc5bdc7622..4d28b692b379 100644 --- a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h @@ -29,12 +29,14 @@ class SelectiveStructColumnReaderBase const std::shared_ptr& fileType, DwrfParams& params, common::ScanSpec& scanSpec, + bool useColumnNames, bool isRoot = false) : dwio::common::SelectiveStructColumnReaderBase( requestedType, fileType, params, scanSpec, + useColumnNames, isRoot), rowsPerRowGroup_(formatData_->rowsPerRowGroup().value()) { VELOX_CHECK_EQ(fileType_->id(), fileType->id(), "working on the same node"); @@ -85,6 +87,7 @@ class SelectiveStructColumnReader : public SelectiveStructColumnReaderBase { const std::shared_ptr& fileType, DwrfParams& params, common::ScanSpec& scanSpec, + bool useColumnNames, bool isRoot = false); private: diff --git a/velox/dwio/dwrf/test/TestColumnReader.cpp b/velox/dwio/dwrf/test/TestColumnReader.cpp index 944b43cebc3d..b0b730ed4cad 100644 --- a/velox/dwio/dwrf/test/TestColumnReader.cpp +++ b/velox/dwio/dwrf/test/TestColumnReader.cpp @@ -151,6 +151,7 @@ class ColumnReaderTestBase { labels_, columnReaderStatistics_, scanSpec, + /*useColumnNames=*/false, FlatMapContext{}); selectiveColumnReader_->setIsTopLevel(); columnReader_ = nullptr; diff --git a/velox/dwio/parquet/reader/ParquetColumnReader.cpp 
b/velox/dwio/parquet/reader/ParquetColumnReader.cpp index 8a2d50541943..2e6665dc39cd 100644 --- a/velox/dwio/parquet/reader/ParquetColumnReader.cpp +++ b/velox/dwio/parquet/reader/ParquetColumnReader.cpp @@ -37,7 +37,9 @@ std::unique_ptr ParquetColumnReader::build( const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec) { + common::ScanSpec& scanSpec, + memory::MemoryPool& pool, + bool useColumnNames) { auto colName = scanSpec.fieldName(); switch (fileType->type()->kind()) { @@ -58,7 +60,7 @@ std::unique_ptr ParquetColumnReader::build( case TypeKind::ROW: return std::make_unique( - requestedType, fileType, params, scanSpec); + requestedType, fileType, params, scanSpec, pool, useColumnNames); case TypeKind::VARBINARY: case TypeKind::VARCHAR: @@ -66,11 +68,11 @@ std::unique_ptr ParquetColumnReader::build( case TypeKind::ARRAY: return std::make_unique( - requestedType, fileType, params, scanSpec); + requestedType, fileType, params, scanSpec, pool, useColumnNames); case TypeKind::MAP: return std::make_unique( - requestedType, fileType, params, scanSpec); + requestedType, fileType, params, scanSpec, pool, useColumnNames); case TypeKind::BOOLEAN: return std::make_unique( diff --git a/velox/dwio/parquet/reader/ParquetColumnReader.h b/velox/dwio/parquet/reader/ParquetColumnReader.h index 8ff086029472..363e0c0b6768 100644 --- a/velox/dwio/parquet/reader/ParquetColumnReader.h +++ b/velox/dwio/parquet/reader/ParquetColumnReader.h @@ -45,6 +45,8 @@ class ParquetColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec); + common::ScanSpec& scanSpec, + memory::MemoryPool& pool, + bool useColumnNames); }; } // namespace facebook::velox::parquet diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index ffcf7c694563..7a90be1879ad 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -79,6 +79,10 @@ class ReaderBase { return options_.fileColumnNamesReadAsLowerCase(); } + bool useColumnNamesForColumnMapping() const { + return options_.useColumnNamesForColumnMapping(); + } + const tz::TimeZone* sessionTimezone() const { return options_.sessionTimezone(); } @@ -1065,7 +1069,9 @@ class ParquetRowReader::Impl { requestedType_, readerBase_->schemaWithId(), // Id is schema id params, - *options_.scanSpec()); + *options_.scanSpec(), + pool_, + readerBase_->useColumnNamesForColumnMapping()); columnReader_->setIsTopLevel(); filterRowGroups(); diff --git a/velox/dwio/parquet/reader/RepeatedColumnReader.cpp b/velox/dwio/parquet/reader/RepeatedColumnReader.cpp index fe7dce21ad48..1c7df63610c0 100644 --- a/velox/dwio/parquet/reader/RepeatedColumnReader.cpp +++ b/velox/dwio/parquet/reader/RepeatedColumnReader.cpp @@ -33,6 +33,9 @@ PageReader* readLeafRepDefs( return nullptr; } auto pageReader = reader->formatData().as().reader(); + if (pageReader == nullptr) { + return nullptr; + } pageReader->decodeRepDefs(numTop); return pageReader; } @@ -113,7 +116,9 @@ MapColumnReader::MapColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec) + common::ScanSpec& scanSpec, + memory::MemoryPool& pool, + bool useColumnNames) : dwio::common::SelectiveMapColumnReader( requestedType, fileType, @@ -123,9 +128,19 @@ MapColumnReader::MapColumnReader( auto& keyChildType = requestedType->childAt(0); auto& elementChildType = 
requestedType->childAt(1); keyReader_ = ParquetColumnReader::build( - keyChildType, fileType_->childAt(0), params, *scanSpec.children()[0]); + keyChildType, + fileType_->childAt(0), + params, + *scanSpec.children()[0], + pool, + useColumnNames); elementReader_ = ParquetColumnReader::build( - elementChildType, fileType_->childAt(1), params, *scanSpec.children()[1]); + elementChildType, + fileType_->childAt(1), + params, + *scanSpec.children()[1], + pool, + useColumnNames); reinterpret_cast(fileType.get()) ->makeLevelInfo(levelInfo_); children_ = {keyReader_.get(), elementReader_.get()}; @@ -222,7 +237,9 @@ ListColumnReader::ListColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec) + common::ScanSpec& scanSpec, + memory::MemoryPool& pool, + bool useColumnNames) : dwio::common::SelectiveListColumnReader( requestedType, fileType, @@ -230,7 +247,12 @@ ListColumnReader::ListColumnReader( scanSpec) { auto& childType = requestedType->childAt(0); child_ = ParquetColumnReader::build( - childType, fileType_->childAt(0), params, *scanSpec.children()[0]); + childType, + fileType_->childAt(0), + params, + *scanSpec.children()[0], + pool, + useColumnNames); reinterpret_cast(fileType.get()) ->makeLevelInfo(levelInfo_); children_ = {child_.get()}; diff --git a/velox/dwio/parquet/reader/RepeatedColumnReader.h b/velox/dwio/parquet/reader/RepeatedColumnReader.h index ecaa1d6522ba..c731f021c446 100644 --- a/velox/dwio/parquet/reader/RepeatedColumnReader.h +++ b/velox/dwio/parquet/reader/RepeatedColumnReader.h @@ -59,7 +59,9 @@ class MapColumnReader : public dwio::common::SelectiveMapColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec); + common::ScanSpec& scanSpec, + memory::MemoryPool& pool, + bool useColumnNames); void prepareRead( vector_size_t offset, @@ -115,7 +117,9 @@ class ListColumnReader : public dwio::common::SelectiveListColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec); + common::ScanSpec& scanSpec, + memory::MemoryPool& pool, + bool useColumnNames); void prepareRead( vector_size_t offset, diff --git a/velox/dwio/parquet/reader/StructColumnReader.cpp b/velox/dwio/parquet/reader/StructColumnReader.cpp index 28267a64367a..f8242fd2d1ed 100644 --- a/velox/dwio/parquet/reader/StructColumnReader.cpp +++ b/velox/dwio/parquet/reader/StructColumnReader.cpp @@ -30,26 +30,63 @@ StructColumnReader::StructColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec) - : SelectiveStructColumnReader(requestedType, fileType, params, scanSpec) { + common::ScanSpec& scanSpec, + memory::MemoryPool& pool, + bool useColumnNames) + : SelectiveStructColumnReader( + requestedType, + fileType, + params, + scanSpec, + useColumnNames, + /*isRoot=*/false) { auto& childSpecs = scanSpec_->stableChildren(); + std::vector missingFields; for (auto i = 0; i < childSpecs.size(); ++i) { auto childSpec = childSpecs[i]; - if (childSpec->isConstant() || isChildMissing(*childSpec)) { + if (childSpec->isConstant() && + (!useColumnNames && isChildMissing(*childSpec))) { childSpec->setSubscript(kConstantChildSpecSubscript); continue; } if (!childSpecs[i]->readFromFile()) { continue; } + if (useColumnNames && isChildMissing(*childSpec)) { + missingFields.emplace_back(i); + continue; + } auto childFileType = 
fileType_->childByName(childSpec->fieldName()); auto childRequestedType = requestedType_->asRow().findChild(childSpec->fieldName()); addChild(ParquetColumnReader::build( - childRequestedType, childFileType, params, *childSpec)); + childRequestedType, + childFileType, + params, + *childSpec, + pool, + useColumnNames)); childSpecs[i]->setSubscript(children_.size() - 1); } + + // missingFields is not empty only when using useColumnNames = true. + if (missingFields.size() > 0) { + // Set the struct as null if all the subfields in the requested type are + // missing and the number of subfields is more than one. + if (childSpecs.size() > 1 && missingFields.size() == childSpecs.size()) { + scanSpec_->setConstantValue( + BaseVector::createNullConstant(requestedType_, 1, &pool)); + } else { + // Set null constant for the missing subfield of requested type. + auto rowTypePtr = asRowType(requestedType_); + for (int channel : missingFields) { + childSpecs[channel]->setConstantValue(BaseVector::createNullConstant( + rowTypePtr->findChild(childSpecs[channel]->fieldName()), 1, &pool)); + } + } + } + auto type = reinterpret_cast(fileType_.get()); if (type->parent()) { levelMode_ = reinterpret_cast(fileType_.get()) @@ -59,7 +96,10 @@ StructColumnReader::StructColumnReader( // this and the child. auto child = childForRepDefs_; for (;;) { - assert(child); + if (child == nullptr) { + levelMode_ = LevelMode::kNulls; + break; + } if (child->fileType().type()->kind() == TypeKind::ARRAY || child->fileType().type()->kind() == TypeKind::MAP) { levelMode_ = LevelMode::kStructOverLists; @@ -96,7 +136,6 @@ StructColumnReader::findBestLeaf() { best = child; } } - assert(best); return best; } diff --git a/velox/dwio/parquet/reader/StructColumnReader.h b/velox/dwio/parquet/reader/StructColumnReader.h index 03fd46092644..0a21652ea806 100644 --- a/velox/dwio/parquet/reader/StructColumnReader.h +++ b/velox/dwio/parquet/reader/StructColumnReader.h @@ -35,7 +35,9 @@ class StructColumnReader : public dwio::common::SelectiveStructColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec); + common::ScanSpec& scanSpec, + memory::MemoryPool& pool, + bool useColumnNames); void read(int64_t offset, const RowSet& rows, const uint64_t* incomingNulls) override; diff --git a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp index c272fa43a8bc..e8140e6c07f0 100644 --- a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp @@ -295,6 +295,11 @@ class ParquetTableScanTest : public HiveConnectorTestBase { "SELECT t from tmp where t == TIMESTAMP '2022-12-23 03:56:01'"); } + const std::vector>& splits() + const { + return splits_; + } + private: RowTypePtr getRowType(std::vector&& outputColumnNames) const { std::vector types; @@ -1167,6 +1172,133 @@ TEST_F(ParquetTableScanTest, schemaMatch) { assertEqualVectors(rows->childAt(2), nullVector); } +TEST_F(ParquetTableScanTest, structMatchByName) { + const auto assertSelectUseColumnNames = + [this]( + const RowTypePtr& outputType, + const std::string& sql, + const std::string& remainingFilter = "") { + const auto plan = + PlanBuilder().tableScan(outputType, {}, remainingFilter).planNode(); + AssertQueryBuilder(plan, duckDbQueryRunner_) + .connectorSessionProperty( + kHiveConnectorId, + HiveConfig::kParquetUseColumnNamesSession, + "true") + .splits(splits()) + .assertResults(sql); + }; + + 
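+  // Write a single-row Parquet file whose 'name' column is a nested struct;
+  // the cases below re-read it with evolved schemas matched by column name.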
std::vector values = {2}; + const auto id = makeFlatVector(values); + const auto name = makeRowVector( + {"first", "last"}, + { + makeFlatVector({"Janet"}), + makeFlatVector({"Jones"}), + }); + const auto address = makeFlatVector({"567 Maple Drive"}); + auto vector = makeRowVector({"id", "name", "address"}, {id, name, address}); + + WriterOptions options; + auto file = TempFilePath::create(); + writeToParquetFile(file->getPath(), {vector}, options); + + loadData(file->getPath(), asRowType(vector->type()), vector); + assertSelect({"id", "name", "address"}, "SELECT id, name, address from tmp"); + + // Add one non-existing subfield 'middle' to the 'name' field and rename filed + // 'address'. + auto rowType = + ROW({"id", "name", "email"}, + {BIGINT(), + ROW({"first", "middle", "last"}, {VARCHAR(), VARCHAR(), VARCHAR()}), + VARCHAR()}); + loadData(file->getPath(), rowType, vector); + assertSelectUseColumnNames( + rowType, "SELECT 2, ('Janet', null, 'Jones'), null"); + + // Filter pushdown on the non-existing field. + assertSelectUseColumnNames( + rowType, "SELECT * from tmp where false", "not(is_null(name.middle))"); + + // Rename subfields of the 'name' field. + rowType = + ROW({"id", "name", "address"}, + {BIGINT(), ROW({"a", "b"}, {VARCHAR(), VARCHAR()}), VARCHAR()}); + loadData(file->getPath(), rowType, vector); + assertSelectUseColumnNames(rowType, "SELECT 2, null, '567 Maple Drive'"); + + // Filter pushdown on the NULL subfield. + assertSelectUseColumnNames( + rowType, "SELECT * from tmp where false", "not(is_null(name))"); + + // Deletion of one subfield from the 'name' field. + rowType = + ROW({"id", "name", "address"}, + {BIGINT(), ROW({"full"}, {VARCHAR()}), VARCHAR()}); + loadData(file->getPath(), rowType, vector); + assertSelectUseColumnNames(rowType, "SELECT 2, row(null), '567 Maple Drive'"); + + // Filter pushdown on the non-existing subfield. + assertSelectUseColumnNames( + rowType, "SELECT * from tmp where false", "not(is_null(name.full))"); + + // No subfield in the 'name' field. + rowType = ROW({"id", "name", "address"}, {BIGINT(), ROW({}, {}), VARCHAR()}); + const auto op = PlanBuilder() + .startTableScan() + .outputType(rowType) + .dataColumns(rowType) + .endTableScan() + .planNode(); + const auto split = makeSplit(file->getPath()); + const auto result = AssertQueryBuilder(op) + .connectorSessionProperty( + kHiveConnectorId, + HiveConfig::kParquetUseColumnNamesSession, + "true") + .split(split) + .copyResults(pool()); + const auto rows = result->as(); + const auto expected = makeRowVector(ROW({}, {}), 1); + assertEqualVectors(expected, rows->childAt(1)); + + // Case sensitivity when matching by name. + vector = makeRowVector( + {"id", "name", "address"}, + {id, + makeRowVector( + {"FIRST", "LAST"}, + { + makeFlatVector({"Janet"}), + makeFlatVector({"Jones"}), + }), + address}); + file = TempFilePath::create(); + writeToParquetFile(file->getPath(), {vector}, options); + + rowType = + ROW({"id", "name", "address"}, + {BIGINT(), + ROW({"first", "middle", "last"}, {VARCHAR(), VARCHAR(), VARCHAR()}), + VARCHAR()}); + loadData(file->getPath(), rowType, vector); + assertSelectUseColumnNames(rowType, "SELECT 2, null, '567 Maple Drive'"); + + // Case insensitivity when matching by name and reading as lower case. 
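+  // With kFileColumnNamesReadAsLowerCaseSession also enabled, the file's
+  // upper-case 'FIRST'/'LAST' subfields are lowered and match the requested
+  // lower-case names, so the struct is read instead of returning null.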
+ auto plan = PlanBuilder().tableScan(rowType, {}, "", rowType).planNode(); + AssertQueryBuilder(plan, duckDbQueryRunner_) + .connectorSessionProperty( + kHiveConnectorId, HiveConfig::kParquetUseColumnNamesSession, "true") + .connectorSessionProperty( + kHiveConnectorId, + HiveConfig::kFileColumnNamesReadAsLowerCaseSession, + "true") + .splits(splits()) + .assertResults("SELECT 2, ('Janet', null, 'Jones'), '567 Maple Drive'"); +} + TEST_F(ParquetTableScanTest, deltaByteArray) { auto a = makeFlatVector({"axis", "axle", "babble", "babyhood"}); auto expected = makeRowVector({"a"}, {a}); diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index 3a1cbcadca0a..749f47d6be54 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -1392,6 +1392,92 @@ TEST_F(TableScanTest, missingColumnsInRepeatedColumns) { .assertResults(expected); } +TEST_F(TableScanTest, structMatchByName) { + const auto assertSelectUseColumnNames = + [this]( + const RowTypePtr& outputType, + const std::string& sql, + const std::string& filePath, + const std::string& remainingFilter = "") { + const auto plan = + PlanBuilder().tableScan(outputType, {}, remainingFilter).planNode(); + AssertQueryBuilder(plan, duckDbQueryRunner_) + .connectorSessionProperty( + kHiveConnectorId, + connector::hive::HiveConfig::kOrcUseColumnNamesSession, + "true") + .split(makeHiveConnectorSplit(filePath)) + .assertResults(sql); + }; + + std::vector values = {2}; + const auto id = makeFlatVector(values); + const auto name = makeRowVector( + {"first", "last"}, + { + makeFlatVector({"Janet"}), + makeFlatVector({"Jones"}), + }); + const auto address = makeFlatVector({"567 Maple Drive"}); + auto vector = makeRowVector({"id", "name", "address"}, {id, name, address}); + + auto file = TempFilePath::create(); + writeToFile(file->getPath(), {vector}); + + // Add one non-existing subfield 'middle' to the 'name' field and rename filed + // 'address'. + auto rowType = + ROW({"id", "name", "email"}, + {BIGINT(), + ROW({"first", "middle", "last"}, {VARCHAR(), VARCHAR(), VARCHAR()}), + VARCHAR()}); + assertSelectUseColumnNames( + rowType, "SELECT 2, ('Janet', null, 'Jones'), null", file->getPath()); + + // Filter pushdown on the non-existing field. + createDuckDbTable({vector}); + assertSelectUseColumnNames( + rowType, + "SELECT * from tmp where false", + file->getPath(), + "not(is_null(name.middle))"); + + // Deletion of one subfield from the 'name' field. + rowType = + ROW({"id", "name", "address"}, + {BIGINT(), ROW({"full"}, {VARCHAR()}), VARCHAR()}); + assertSelectUseColumnNames( + rowType, "SELECT 2, row(null), '567 Maple Drive'", file->getPath()); + + // Filter pushdown on the non-existing subfield. + assertSelectUseColumnNames( + rowType, + "SELECT * from tmp where false", + file->getPath(), + "not(is_null(name.full))"); + + // No subfield in the 'name' field. 
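+  // Requesting an empty ROW() for 'name' should yield a non-null empty-struct
+  // vector rather than a null constant.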
+ rowType = ROW({"id", "name", "address"}, {BIGINT(), ROW({}, {}), VARCHAR()}); + const auto op = PlanBuilder() + .startTableScan() + .outputType(rowType) + .dataColumns(rowType) + .endTableScan() + .planNode(); + const auto split = makeHiveConnectorSplit(file->getPath()); + const auto result = + AssertQueryBuilder(op) + .connectorSessionProperty( + kHiveConnectorId, + connector::hive::HiveConfig::kOrcUseColumnNamesSession, + "true") + .split(split) + .copyResults(pool()); + const auto rows = result->as(); + const auto expected = makeRowVector(ROW({}, {}), 1); + facebook::velox::test::assertEqualVectors(expected, rows->childAt(1)); +} + // Tests queries that use Lazy vectors with multiple layers of wrapping. TEST_F(TableScanTest, constDictLazy) { vector_size_t size = 1'000; From 15a4ca9b55a19ca74760cac37bbb356c9631733b Mon Sep 17 00:00:00 2001 From: "joey.ljy" Date: Mon, 7 Aug 2023 18:37:06 +0800 Subject: [PATCH 02/10] [6020 ] Spark sql avg agg function support decimal fix decimal avg function precision issue --- .../lib/aggregates/AverageAggregateBase.cpp | 3 +- .../lib/aggregates/DecimalAggregate.h | 8 +- .../sparksql/aggregates/AverageAggregate.cpp | 389 +++++++++++++++++- .../tests/AverageAggregationTest.cpp | 147 +++++++ 4 files changed, 523 insertions(+), 24 deletions(-) diff --git a/velox/functions/lib/aggregates/AverageAggregateBase.cpp b/velox/functions/lib/aggregates/AverageAggregateBase.cpp index 047d48cbf270..52e04dceccff 100644 --- a/velox/functions/lib/aggregates/AverageAggregateBase.cpp +++ b/velox/functions/lib/aggregates/AverageAggregateBase.cpp @@ -21,7 +21,8 @@ namespace facebook::velox::functions::aggregate { void checkAvgIntermediateType(const TypePtr& type) { VELOX_USER_CHECK( type->isRow() || type->isVarbinary(), - "Input type for final average must be row type or varbinary type."); + "Input type for final average must be row type or varbinary type, find {}", + type->toString()); if (type->kind() == TypeKind::VARBINARY) { return; } diff --git a/velox/functions/lib/aggregates/DecimalAggregate.h b/velox/functions/lib/aggregates/DecimalAggregate.h index 8e78a545d70c..58b534cda608 100644 --- a/velox/functions/lib/aggregates/DecimalAggregate.h +++ b/velox/functions/lib/aggregates/DecimalAggregate.h @@ -78,7 +78,7 @@ class DecimalAggregate : public exec::Aggregate { } int32_t accumulatorAlignmentSize() const override { - return static_cast(sizeof(int128_t)); + return alignof(LongDecimalWithOverflowState); } void addRawInput( @@ -275,7 +275,9 @@ class DecimalAggregate : public exec::Aggregate { } virtual TResultType computeFinalValue( - LongDecimalWithOverflowState* accumulator) = 0; + LongDecimalWithOverflowState* accumulator) { + return 0; + }; void extractValues(char** groups, int32_t numGroups, VectorPtr* result) override { @@ -327,11 +329,11 @@ class DecimalAggregate : public exec::Aggregate { } } - private: inline LongDecimalWithOverflowState* decimalAccumulator(char* group) { return exec::Aggregate::value(group); } + private: DecodedVector decodedRaw_; DecodedVector decodedPartial_; }; diff --git a/velox/functions/sparksql/aggregates/AverageAggregate.cpp b/velox/functions/sparksql/aggregates/AverageAggregate.cpp index 623657df3a79..1c81de6e6b59 100644 --- a/velox/functions/sparksql/aggregates/AverageAggregate.cpp +++ b/velox/functions/sparksql/aggregates/AverageAggregate.cpp @@ -16,6 +16,7 @@ #include "velox/functions/sparksql/aggregates/AverageAggregate.h" #include "velox/functions/lib/aggregates/AverageAggregateBase.h" +#include 
"velox/functions/sparksql/DecimalUtil.h" using namespace facebook::velox::functions::aggregate; @@ -74,6 +75,308 @@ class AverageAggregate } }; +template +class DecimalAverageAggregate : public DecimalAggregate { + public: + explicit DecimalAverageAggregate(TypePtr resultType, TypePtr sumType) + : DecimalAggregate(resultType), sumType_(sumType) {} + + void addIntermediateResults( + char** groups, + const SelectivityVector& rows, + const std::vector& args, + bool /* mayPushdown */) override { + decodedPartial_.decode(*args[0], rows); + auto baseRowVector = dynamic_cast(decodedPartial_.base()); + auto sumVector = baseRowVector->childAt(0)->as>(); + auto countVector = baseRowVector->childAt(1)->as>(); + VELOX_USER_CHECK_NOT_NULL(sumVector); + + if (decodedPartial_.isConstantMapping()) { + if (!decodedPartial_.isNullAt(0)) { + auto decodedIndex = decodedPartial_.index(0); + auto count = countVector->valueAt(decodedIndex); + if (sumVector->isNullAt(decodedIndex) && + !countVector->isNullAt(decodedIndex) && count > 0) { + // Find overflow, set all groups to null. + rows.applyToSelected( + [&](vector_size_t i) { this->setNull(groups[i]); }); + } else { + auto sum = sumVector->valueAt(decodedIndex); + rows.applyToSelected([&](vector_size_t i) { + this->clearNull(groups[i]); + auto accumulator = this->decimalAccumulator(groups[i]); + mergeSumCount(accumulator, sum, count); + }); + } + } + } else if (decodedPartial_.mayHaveNulls()) { + rows.applyToSelected([&](vector_size_t i) { + if (decodedPartial_.isNullAt(i)) { + return; + } + auto decodedIndex = decodedPartial_.index(i); + auto count = countVector->valueAt(decodedIndex); + if (sumVector->isNullAt(decodedIndex) && + !countVector->isNullAt(decodedIndex) && count > 0) { + this->setNull(groups[i]); + } else { + this->clearNull(groups[i]); + auto sum = sumVector->valueAt(decodedIndex); + auto accumulator = this->decimalAccumulator(groups[i]); + mergeSumCount(accumulator, sum, count); + } + }); + } else { + rows.applyToSelected([&](vector_size_t i) { + auto decodedIndex = decodedPartial_.index(i); + auto count = countVector->valueAt(decodedIndex); + if (sumVector->isNullAt(decodedIndex) && + !countVector->isNullAt(decodedIndex) && count > 0) { + this->setNull(groups[i]); + } else { + this->clearNull(groups[i]); + auto sum = sumVector->valueAt(decodedIndex); + auto accumulator = this->decimalAccumulator(groups[i]); + mergeSumCount(accumulator, sum, count); + } + }); + } + } + + void addSingleGroupIntermediateResults( + char* group, + const SelectivityVector& rows, + const std::vector& args, + bool /* mayPushdown */) override { + decodedPartial_.decode(*args[0], rows); + auto baseRowVector = dynamic_cast(decodedPartial_.base()); + auto sumVector = baseRowVector->childAt(0)->as>(); + auto countVector = baseRowVector->childAt(1)->as>(); + + if (decodedPartial_.isConstantMapping()) { + if (!decodedPartial_.isNullAt(0)) { + auto decodedIndex = decodedPartial_.index(0); + if (isPartialSumOverflow(sumVector, countVector, decodedIndex)) { + // Find overflow, just set group to null and return. 
+ this->setNull(group); + return; + } else { + if (rows.hasSelections()) { + this->clearNull(group); + } + auto sum = sumVector->valueAt(decodedIndex); + auto count = countVector->valueAt(decodedIndex); + rows.applyToSelected( + [&](vector_size_t i) { mergeAccumulators(group, sum, count); }); + } + } + } else if (decodedPartial_.mayHaveNulls()) { + rows.applyToSelected([&](vector_size_t i) { + if (!decodedPartial_.isNullAt(i)) { + this->clearNull(group); + auto decodedIndex = decodedPartial_.index(i); + if (isPartialSumOverflow(sumVector, countVector, decodedIndex)) { + // Find overflow, just set group to null. + this->setNull(group); + } else { + auto sum = sumVector->valueAt(decodedIndex); + auto count = countVector->valueAt(decodedIndex); + mergeAccumulators(group, sum, count); + } + } + }); + } else { + if (rows.hasSelections()) { + this->clearNull(group); + } + rows.applyToSelected([&](vector_size_t i) { + auto decodedIndex = decodedPartial_.index(i); + if (isPartialSumOverflow(sumVector, countVector, decodedIndex)) { + // Find overflow, just set group to null. + this->setNull(group); + } else { + auto sum = sumVector->valueAt(decodedIndex); + auto count = countVector->valueAt(decodedIndex); + mergeAccumulators(group, sum, count); + } + }); + } + } + + void extractAccumulators(char** groups, int32_t numGroups, VectorPtr* result) + override { + auto rowVector = (*result)->as(); + auto sumVector = rowVector->childAt(0)->asFlatVector(); + auto countVector = rowVector->childAt(1)->asFlatVector(); + VELOX_USER_CHECK_NOT_NULL(sumVector); + + rowVector->resize(numGroups); + sumVector->resize(numGroups); + countVector->resize(numGroups); + rowVector->clearAllNulls(); + + int64_t* rawCounts = countVector->mutableRawValues(); + int128_t* rawSums = sumVector->mutableRawValues(); + for (auto i = 0; i < numGroups; ++i) { + char* group = groups[i]; + auto* accumulator = this->decimalAccumulator(group); + std::optional adjustedSum = DecimalUtil::adjustSumForOverflow( + accumulator->sum, accumulator->overflow); + if (adjustedSum.has_value()) { + rawCounts[i] = accumulator->count; + rawSums[i] = adjustedSum.value(); + } else { + // Find overflow. + sumVector->setNull(i, true); + rawCounts[i] = accumulator->count; + } + } + } + + void extractValues(char** groups, int32_t numGroups, VectorPtr* result) + override { + auto vector = (*result)->as>(); + VELOX_CHECK(vector); + vector->resize(numGroups); + uint64_t* rawNulls = this->getRawNulls(vector); + + TResultType* rawValues = vector->mutableRawValues(); + for (int32_t i = 0; i < numGroups; ++i) { + char* group = groups[i]; + auto accumulator = this->decimalAccumulator(group); + if (accumulator->count == 0) { + // In Spark, if all inputs are null, count will be 0, + // and the result of final avg will be null. + vector->setNull(i, true); + } else { + this->clearNull(rawNulls, i); + std::optional avg = computeAvg(accumulator); + if (avg.has_value()) { + rawValues[i] = avg.value(); + } else { + // Find overflow. + vector->setNull(i, true); + } + } + } + } + + std::optional computeAvg( + LongDecimalWithOverflowState* accumulator) { + std::optional validSum = DecimalUtil::adjustSumForOverflow( + accumulator->sum, accumulator->overflow); + if (!validSum.has_value()) { + return std::nullopt; + } + + auto [resultPrecision, resultScale] = + getDecimalPrecisionScale(*this->resultType().get()); + // Spark use DECIMAL(20,0) to represent long value. 
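+    // The average precision/scale follow Spark's decimal division rule:
+    // integer digits = sumPrecision - sumScale + countScale,
+    // scale = max(6, sumScale + countPrecision + 1), and the pair is then
+    // capped at 38 digits by adjustPrecisionScale().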
+ const uint8_t countPrecision = 20, countScale = 0; + auto [sumPrecision, sumScale] = + getDecimalPrecisionScale(*this->sumType_.get()); + auto [avgPrecision, avgScale] = computeResultPrecisionScale( + sumPrecision, sumScale, countPrecision, countScale); + avgScale = std::max(avgScale, resultScale); + auto sumRescale = computeRescaleFactor(sumScale, countScale, avgScale); + auto countDecimal = accumulator->count; + int128_t avg = 0; + + bool overflow = false; + functions::sparksql::DecimalUtil:: + divideWithRoundUp( + avg, validSum.value(), countDecimal, sumRescale, overflow); + if (overflow) { + return std::nullopt; + } + TResultType rescaledValue; + const auto status = DecimalUtil::rescaleWithRoundUp( + avg, + avgPrecision, + avgScale, + resultPrecision, + resultScale, + rescaledValue); + return status.ok() ? std::optional(rescaledValue) + : std::nullopt; + } + + private: + template + inline void mergeSumCount( + LongDecimalWithOverflowState* accumulator, + UnscaledType sum, + int64_t count) { + accumulator->count += count; + accumulator->overflow += + DecimalUtil::addWithOverflow(accumulator->sum, sum, accumulator->sum); + } + + template + void mergeAccumulators( + char* group, + const UnscaledType& otherSum, + const int64_t& otherCount) { + if constexpr (tableHasNulls) { + exec::Aggregate::clearNull(group); + } + auto accumulator = this->decimalAccumulator(group); + mergeSumCount(accumulator, otherSum, otherCount); + } + + inline static bool isPartialSumOverflow( + SimpleVector* sumVector, + SimpleVector* countVector, + int32_t index) { + return sumVector->isNullAt(index) && !countVector->isNullAt(index) && + countVector->valueAt(index) > 0; + } + + inline static uint8_t + computeRescaleFactor(uint8_t fromScale, uint8_t toScale, uint8_t rScale) { + return rScale - fromScale + toScale; + } + + inline static std::pair computeResultPrecisionScale( + const uint8_t aPrecision, + const uint8_t aScale, + const uint8_t bPrecision, + const uint8_t bScale) { + uint8_t intDig = aPrecision - aScale + bScale; + uint8_t scale = std::max(6, aScale + bPrecision + 1); + uint8_t precision = intDig + scale; + return functions::sparksql::DecimalUtil::adjustPrecisionScale( + precision, scale); + } + + inline static std::pair adjustPrecisionScale( + const uint8_t precision, + const uint8_t scale) { + VELOX_CHECK(precision >= scale); + if (precision <= 38) { + return {precision, scale}; + } else { + uint8_t intDigits = precision - scale; + uint8_t minScaleValue = std::min(scale, (uint8_t)6); + uint8_t adjustedScale = + std::max((uint8_t)(38 - intDigits), minScaleValue); + return {38, adjustedScale}; + } + } + + DecodedVector decodedRaw_; + DecodedVector decodedPartial_; + TypePtr sumType_; +}; + +TypePtr getDecimalSumType( + const uint8_t rawInputPrecision, + const uint8_t rawInputScale) { + // This computational logic is derived from the definition of Spark SQL. 
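+  // The sum buffer keeps the input scale and widens the precision by 10,
+  // capped at the 38-digit maximum.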
+ return DECIMAL(std::min(38, rawInputPrecision + 10), rawInputScale); +} + } // namespace /// Count is BIGINT() while sum and the final aggregates type depends on @@ -99,13 +402,25 @@ exec::AggregateRegistrationResult registerAverage( .build()); } - signatures.push_back(exec::AggregateFunctionSignatureBuilder() - .integerVariable("a_precision") - .integerVariable("a_scale") - .argumentType("DECIMAL(a_precision, a_scale)") - .intermediateType("varbinary") - .returnType("DECIMAL(a_precision, a_scale)") - .build()); + signatures.push_back( + exec::AggregateFunctionSignatureBuilder() + .integerVariable("a_precision") + .integerVariable("a_scale") + .integerVariable("r_precision", "min(38, a_precision + 4)") + .integerVariable("r_scale", "min(38, a_scale + 4)") + .argumentType("DECIMAL(a_precision, a_scale)") + .intermediateType("ROW(DECIMAL(38, a_scale), BIGINT)") + .returnType("DECIMAL(r_precision, r_scale)") + .build()); + + signatures.push_back( + exec::AggregateFunctionSignatureBuilder() + .integerVariable("a_precision") + .integerVariable("a_scale") + .argumentType("DECIMAL(a_precision, a_scale)") + .intermediateType("ROW(DECIMAL(a_precision, a_scale), BIGINT)") + .returnType("DECIMAL(a_precision, a_scale)") + .build()); return exec::registerAggregateFunction( name, @@ -118,7 +433,7 @@ exec::AggregateRegistrationResult registerAverage( -> std::unique_ptr { VELOX_CHECK_LE( argTypes.size(), 1, "{} takes at most one argument", name); - auto inputType = argTypes[0]; + const auto& inputType = argTypes[0]; if (exec::isRawInput(step)) { switch (inputType->kind()) { case TypeKind::SMALLINT: @@ -129,16 +444,39 @@ exec::AggregateRegistrationResult registerAverage( AverageAggregate>(resultType); case TypeKind::BIGINT: { if (inputType->isShortDecimal()) { - return std::make_unique>( - resultType); + auto inputPrecision = inputType->asShortDecimal().precision(); + auto inputScale = inputType->asShortDecimal().scale(); + auto sumType = + DECIMAL(std::min(38, inputPrecision + 10), inputScale); + if (exec::isPartialOutput(step)) { + return std::make_unique< + DecimalAverageAggregate>( + resultType, sumType); + } else { + if (resultType->isShortDecimal()) { + return std::make_unique< + DecimalAverageAggregate>( + resultType, sumType); + } else if (resultType->isLongDecimal()) { + return std::make_unique< + DecimalAverageAggregate>( + resultType, sumType); + } else { + VELOX_FAIL("Result type must be decimal"); + } + } } return std::make_unique< AverageAggregate>(resultType); } case TypeKind::HUGEINT: { if (inputType->isLongDecimal()) { - return std::make_unique>( - resultType); + auto inputPrecision = inputType->asLongDecimal().precision(); + auto inputScale = inputType->asLongDecimal().scale(); + auto sumType = getDecimalSumType(inputPrecision, inputScale); + return std::make_unique< + DecimalAverageAggregate>( + resultType, sumType); } VELOX_NYI(); } @@ -162,26 +500,37 @@ exec::AggregateRegistrationResult registerAverage( resultType); case TypeKind::DOUBLE: case TypeKind::ROW: + if (inputType->childAt(0)->isLongDecimal()) { + return std::make_unique< + DecimalAverageAggregate>( + resultType, inputType->childAt(0)); + } return std::make_unique< AverageAggregate>(resultType); case TypeKind::BIGINT: - return std::make_unique>( - resultType); + VELOX_USER_CHECK(resultType->isShortDecimal()); + return std::make_unique< + DecimalAverageAggregate>( + resultType, inputType->childAt(0)); case TypeKind::HUGEINT: - return std::make_unique>( - resultType); + VELOX_USER_CHECK(resultType->isLongDecimal()); + return 
std::make_unique< + DecimalAverageAggregate>( + resultType, inputType->childAt(0)); case TypeKind::VARBINARY: if (inputType->isLongDecimal()) { - return std::make_unique>( - resultType); + return std::make_unique< + DecimalAverageAggregate>( + resultType, inputType->childAt(0)); } else if ( inputType->isShortDecimal() || inputType->kind() == TypeKind::VARBINARY) { // If the input and out type are VARBINARY, then the // LongDecimalWithOverflowState is used and the template type // does not matter. - return std::make_unique>( - resultType); + return std::make_unique< + DecimalAverageAggregate>( + resultType, inputType->childAt(0)); } [[fallthrough]]; default: diff --git a/velox/functions/sparksql/aggregates/tests/AverageAggregationTest.cpp b/velox/functions/sparksql/aggregates/tests/AverageAggregationTest.cpp index 93057ef155a5..6dac5631ab92 100644 --- a/velox/functions/sparksql/aggregates/tests/AverageAggregationTest.cpp +++ b/velox/functions/sparksql/aggregates/tests/AverageAggregationTest.cpp @@ -110,5 +110,152 @@ TEST_F(AverageAggregationTest, avgAllNulls) { assertQuery(plan, expected); } +TEST_F(AverageAggregationTest, avgDecimal) { + int64_t kRescale = DecimalUtil::kPowersOfTen[4]; + // Short decimal aggregation + auto shortDecimal = makeNullableFlatVector( + {1'000, 2'000, 3'000, 4'000, 5'000, std::nullopt}, DECIMAL(10, 1)); + testAggregations( + {makeRowVector({shortDecimal})}, + {}, + {"spark_avg(c0)"}, + {}, + {makeRowVector({makeNullableFlatVector( + {3'000 * kRescale}, DECIMAL(14, 5))})}); + + // Long decimal aggregation + testAggregations( + {makeRowVector({makeNullableFlatVector( + {HugeInt::build(10, 100), + HugeInt::build(10, 200), + HugeInt::build(10, 300), + HugeInt::build(10, 400), + HugeInt::build(10, 500), + std::nullopt}, + DECIMAL(23, 4))})}, + {}, + {"spark_avg(c0)"}, + {}, + {makeRowVector({makeFlatVector( + std::vector{HugeInt::build(10, 300) * kRescale}, + DECIMAL(27, 8))})}); + + // The total sum overflows the max int128_t limit. + std::vector rawVector; + for (int i = 0; i < 10; ++i) { + rawVector.push_back(DecimalUtil::kLongDecimalMax); + } + testAggregations( + {makeRowVector({makeFlatVector(rawVector, DECIMAL(38, 0))})}, + {}, + {"spark_avg(c0)"}, + {}, + {makeRowVector({makeNullableFlatVector( + std::vector>{std::nullopt}, + DECIMAL(38, 4))})}); + + // The total sum underflows the min int128_t limit. + rawVector.clear(); + auto underFlowTestResult = makeNullableFlatVector( + std::vector>{std::nullopt}, DECIMAL(38, 4)); + for (int i = 0; i < 10; ++i) { + rawVector.push_back(DecimalUtil::kLongDecimalMin); + } + testAggregations( + {makeRowVector({makeFlatVector(rawVector, DECIMAL(38, 0))})}, + {}, + {"spark_avg(c0)"}, + {}, + {makeRowVector({underFlowTestResult})}); + + // Test constant vector. + testAggregations( + {makeRowVector({makeConstant(100, 10, DECIMAL(10, 2))})}, + {}, + {"spark_avg(c0)"}, + {}, + {makeRowVector({makeFlatVector( + std::vector{100 * kRescale}, DECIMAL(14, 6))})}); + + auto newSize = shortDecimal->size() * 2; + auto indices = makeIndices(newSize, [&](int row) { return row / 2; }); + auto dictVector = + VectorTestBase::wrapInDictionary(indices, newSize, shortDecimal); + + testAggregations( + {makeRowVector({dictVector})}, + {}, + {"spark_avg(c0)"}, + {}, + {makeRowVector({makeFlatVector( + std::vector{3'000 * kRescale}, DECIMAL(14, 5))})}); + + // Decimal average aggregation with multiple groups. 
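+  // Five input batches with group keys 1-3; the values include negatives and
+  // zero to exercise grouped accumulation across multiple row vectors.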
+ auto inputRows = { + makeRowVector( + {makeNullableFlatVector({1, 1}), + makeFlatVector({37220, 53450}, DECIMAL(15, 2))}), + makeRowVector( + {makeNullableFlatVector({2, 2}), + makeFlatVector({10410, 9250}, DECIMAL(15, 2))}), + makeRowVector( + {makeNullableFlatVector({3, 3}), + makeFlatVector({-12783, 0}, DECIMAL(15, 2))}), + makeRowVector( + {makeNullableFlatVector({1, 2}), + makeFlatVector({23178, 41093}, DECIMAL(15, 2))}), + makeRowVector( + {makeNullableFlatVector({2, 3}), + makeFlatVector({-10023, 5290}, DECIMAL(15, 2))}), + }; + + auto expectedResult = { + makeRowVector( + {makeNullableFlatVector({1}), + makeFlatVector(std::vector{379493333}, DECIMAL(19, 6))}), + makeRowVector( + {makeNullableFlatVector({2}), + makeFlatVector(std::vector{126825000}, DECIMAL(19, 6))}), + makeRowVector( + {makeNullableFlatVector({3}), + makeFlatVector(std::vector{-24976667}, DECIMAL(19, 6))})}; + + testAggregations(inputRows, {"c0"}, {"spark_avg(c1)"}, expectedResult); + + auto valueA = HugeInt::parse("11999999998800000000"); + auto valueB = HugeInt::parse("12000000000000000000"); + auto longDecimalInputRows = {makeRowVector( + {makeNullableFlatVector({1, 1, 1, 1, 1, 1, 1}), + makeFlatVector( + {valueA, valueA, valueA, valueB, valueB, valueB, valueB}, + DECIMAL(38, 18))})}; + + auto longDecimalExpectedResult = {makeRowVector( + {makeNullableFlatVector({1}), + makeFlatVector( + std::vector{HugeInt::parse("119999999994857142857143")}, + DECIMAL(38, 22))})}; + + testAggregations( + longDecimalInputRows, + {"c0"}, + {"spark_avg(c1)"}, + longDecimalExpectedResult); +} + +TEST_F(AverageAggregationTest, avgDecimalWithMultipleRowVectors) { + int64_t kRescale = DecimalUtil::kPowersOfTen[4]; + auto inputRows = { + makeRowVector({makeFlatVector({100, 200}, DECIMAL(15, 2))}), + makeRowVector({makeFlatVector({300, 400}, DECIMAL(15, 2))}), + makeRowVector({makeFlatVector({500, 600}, DECIMAL(15, 2))}), + }; + + auto expectedResult = {makeRowVector( + {makeFlatVector(std::vector{350 * kRescale}, DECIMAL(19, 6))})}; + + testAggregations(inputRows, {}, {"spark_avg(c0)"}, expectedResult); +} + } // namespace } // namespace facebook::velox::functions::aggregate::sparksql::test From 86c480ce7c1e9b49a73140176b5e7c9dbe4c5b15 Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Fri, 29 Dec 2023 10:31:07 +0800 Subject: [PATCH 03/10] [oap ] Register merge extract companion agg functions without suffix --- velox/exec/AggregateCompanionAdapter.cpp | 50 ++++++++++++++++++------ 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/velox/exec/AggregateCompanionAdapter.cpp b/velox/exec/AggregateCompanionAdapter.cpp index ea3c64c5c9ea..f77400e952d0 100644 --- a/velox/exec/AggregateCompanionAdapter.cpp +++ b/velox/exec/AggregateCompanionAdapter.cpp @@ -267,10 +267,13 @@ bool CompanionFunctionsRegistrar::registerPartialFunction( const core::QueryConfig& config) -> std::unique_ptr { if (auto func = getAggregateFunctionEntry(name)) { + core::AggregationNode::Step usedStep{ + core::AggregationNode::Step::kPartial}; if (!exec::isRawInput(step)) { - step = core::AggregationNode::Step::kIntermediate; + usedStep = core::AggregationNode::Step::kIntermediate; } - auto fn = func->factory(step, argTypes, resultType, config); + auto fn = + func->factory(usedStep, argTypes, resultType, config); VELOX_CHECK_NOT_NULL(fn); return std::make_unique< AggregateCompanionAdapter::PartialFunction>( @@ -409,26 +412,51 @@ bool CompanionFunctionsRegistrar::registerMergeExtractFunction( const std::vector& signatures, const AggregateFunctionMetadata& 
metadata, bool overwrite) { + bool registered = false; if (CompanionSignatures::hasSameIntermediateTypesAcrossSignatures( signatures)) { - return registerMergeExtractFunctionWithSuffix( - name, signatures, metadata, overwrite); + registered |= + registerMergeExtractFunctionWithSuffix(name, signatures, metadata, overwrite); } auto mergeExtractSignatures = CompanionSignatures::mergeExtractFunctionSignatures(signatures); if (mergeExtractSignatures.empty()) { - return false; + return registered; } auto mergeExtractFunctionName = CompanionSignatures::mergeExtractFunctionName(name); - return registerMergeExtractFunctionInternal( - name, - mergeExtractFunctionName, - std::move(mergeExtractSignatures), - metadata, - overwrite); + registered |= + exec::registerAggregateFunction( + mergeExtractFunctionName, + std::move(mergeExtractSignatures), + [name, mergeExtractFunctionName]( + core::AggregationNode::Step /*step*/, + const std::vector& argTypes, + const TypePtr& resultType, + const core::QueryConfig& config) -> std::unique_ptr { + if (auto func = getAggregateFunctionEntry(name)) { + auto fn = func->factory( + core::AggregationNode::Step::kFinal, + argTypes, + resultType, + config); + VELOX_CHECK_NOT_NULL(fn); + return std::make_unique< + AggregateCompanionAdapter::MergeExtractFunction>( + std::move(fn), resultType); + } + VELOX_FAIL( + "Original aggregation function {} not found: {}", + name, + mergeExtractFunctionName); + }, + metadata, + /*registerCompanionFunctions*/ false, + overwrite) + .mainFunction; + return registered; } VectorFunctionFactory getVectorFunctionFactory( From 4331529c320654425fd7972c0338025358b8deeb Mon Sep 17 00:00:00 2001 From: Rui Mo Date: Fri, 14 Mar 2025 11:44:17 +0000 Subject: [PATCH 04/10] [11067] Support scan filter for decimal in ORC --- velox/dwio/dwrf/reader/ReaderBase.cpp | 14 +- .../reader/SelectiveDecimalColumnReader.cpp | 180 ++++++++++++++++-- .../reader/SelectiveDecimalColumnReader.h | 19 +- velox/dwio/dwrf/test/E2EFilterTest.cpp | 86 ++++++++- velox/dwio/dwrf/utils/ProtoUtils.cpp | 19 +- velox/dwio/dwrf/writer/ColumnWriter.cpp | 28 ++- velox/dwio/dwrf/writer/Writer.cpp | 7 +- 7 files changed, 319 insertions(+), 34 deletions(-) diff --git a/velox/dwio/dwrf/reader/ReaderBase.cpp b/velox/dwio/dwrf/reader/ReaderBase.cpp index 71dbc2e41510..16bd779b500a 100644 --- a/velox/dwio/dwrf/reader/ReaderBase.cpp +++ b/velox/dwio/dwrf/reader/ReaderBase.cpp @@ -343,13 +343,19 @@ std::shared_ptr ReaderBase::convertType( return SMALLINT(); case TypeKind::INTEGER: return INTEGER(); - case TypeKind::BIGINT: + case TypeKind::BIGINT: { + TypePtr converted; if (type.format() == DwrfFormat::kOrc && type.getOrcPtr()->kind() == proto::orc::Type_Kind_DECIMAL) { - return DECIMAL( - type.getOrcPtr()->precision(), type.getOrcPtr()->scale()); + converted = + DECIMAL(type.getOrcPtr()->precision(), type.getOrcPtr()->scale()); + } else { + converted = BIGINT(); + common::testutil::TestValue::adjust( + "facebook::velox::dwrf::ReaderBase::convertType", &converted); } - return BIGINT(); + return converted; + } case TypeKind::HUGEINT: if (type.format() == DwrfFormat::kOrc && type.getOrcPtr()->kind() == proto::orc::Type_Kind_DECIMAL) { diff --git a/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.cpp index ec570ae05b7f..ba14222b1dc5 100644 --- a/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.cpp @@ -75,16 +75,17 @@ void 
SelectiveDecimalColumnReader::seekToRowGroup(int64_t index) { template template -void SelectiveDecimalColumnReader::readHelper(RowSet rows) { - vector_size_t numRows = rows.back() + 1; +void SelectiveDecimalColumnReader::readHelper( + common::Filter* filter, + RowSet rows) { ExtractToReader extractValues(this); - common::AlwaysTrue filter; + common::AlwaysTrue alwaysTrue; DirectRleColumnVisitor< int64_t, common::AlwaysTrue, decltype(extractValues), kDense> - visitor(filter, this, rows, extractValues); + visitor(alwaysTrue, this, rows, extractValues); // decode scale stream if (version_ == velox::dwrf::RleVersion_1) { @@ -104,14 +105,161 @@ void SelectiveDecimalColumnReader::readHelper(RowSet rows) { // reset numValues_ before reading values numValues_ = 0; valueSize_ = sizeof(DataT); + vector_size_t numRows = rows.back() + 1; ensureValuesCapacity(numRows); // decode value stream facebook::velox::dwio::common:: ColumnVisitor - valueVisitor(filter, this, rows, extractValues); + valueVisitor(alwaysTrue, this, rows, extractValues); decodeWithVisitor>(valueDecoder_.get(), valueVisitor); readOffset_ += numRows; + + // Fill decimals before applying filter. + fillDecimals(); + + const auto rawNulls = nullsInReadRange_ + ? (kDense ? nullsInReadRange_->as() : rawResultNulls_) + : nullptr; + // Process filter. + process(filter, rows, rawNulls); +} + +template +void SelectiveDecimalColumnReader::processNulls( + bool isNull, + const RowSet& rows, + const uint64_t* rawNulls) { + if (!rawNulls) { + return; + } + returnReaderNulls_ = false; + anyNulls_ = !isNull; + allNull_ = isNull; + + auto rawDecimal = values_->asMutable(); + auto rawScale = scaleBuffer_->asMutable(); + + vector_size_t idx = 0; + if (isNull) { + for (vector_size_t i = 0; i < numValues_; i++) { + if (bits::isBitNull(rawNulls, i)) { + bits::setNull(rawResultNulls_, idx); + addOutputRow(rows[i]); + idx++; + } + } + } else { + for (vector_size_t i = 0; i < numValues_; i++) { + if (!bits::isBitNull(rawNulls, i)) { + bits::setNull(rawResultNulls_, idx, false); + rawDecimal[idx] = rawDecimal[i]; + rawScale[idx] = rawScale[i]; + addOutputRow(rows[i]); + idx++; + } + } + } +} + +template +void SelectiveDecimalColumnReader::processFilter( + const common::Filter* filter, + const RowSet& rows, + const uint64_t* rawNulls) { + VELOX_CHECK_NOT_NULL(filter, "Filter must not be null."); + returnReaderNulls_ = false; + anyNulls_ = false; + allNull_ = true; + + vector_size_t idx = 0; + auto rawDecimal = values_->asMutable(); + for (vector_size_t i = 0; i < numValues_; i++) { + if (rawNulls && bits::isBitNull(rawNulls, i)) { + if (filter->testNull()) { + bits::setNull(rawResultNulls_, idx); + addOutputRow(rows[i]); + anyNulls_ = true; + idx++; + } + } else { + bool tested; + if constexpr (std::is_same_v) { + tested = filter->testInt64(rawDecimal[i]); + } else { + tested = filter->testInt128(rawDecimal[i]); + } + + if (tested) { + if (rawNulls) { + bits::setNull(rawResultNulls_, idx, false); + } + rawDecimal[idx] = rawDecimal[i]; + addOutputRow(rows[i]); + allNull_ = false; + idx++; + } + } + } +} + +template +void SelectiveDecimalColumnReader::process( + const common::Filter* filter, + const RowSet& rows, + const uint64_t* rawNulls) { + // Treat the filter as kAlwaysTrue if any of the following conditions are met: + // 1) No filter found; + // 2) Filter is kIsNotNull but rawNulls == NULL (no elements is null). + auto filterKind = + !filter || (filter->kind() == common::FilterKind::kIsNotNull && !rawNulls) + ? 
common::FilterKind::kAlwaysTrue + : filter->kind(); + switch (filterKind) { + case common::FilterKind::kAlwaysTrue: + // Simply add all rows to output. + for (vector_size_t i = 0; i < numValues_; i++) { + addOutputRow(rows[i]); + } + break; + case common::FilterKind::kIsNull: + processNulls(true, rows, rawNulls); + break; + case common::FilterKind::kIsNotNull: + processNulls(false, rows, rawNulls); + break; + case common::FilterKind::kBigintRange: + case common::FilterKind::kBigintValuesUsingHashTable: + case common::FilterKind::kBigintValuesUsingBitmask: + case common::FilterKind::kNegatedBigintRange: + case common::FilterKind::kNegatedBigintValuesUsingHashTable: + case common::FilterKind::kNegatedBigintValuesUsingBitmask: + case common::FilterKind::kBigintMultiRange: { + if constexpr (std::is_same_v) { + processFilter(filter, rows, rawNulls); + } else { + const auto actualType = CppToType::create(); + VELOX_NYI( + "Expected type BIGINT, but found file type {}.", + actualType->toString()); + } + break; + } + case common::FilterKind::kHugeintValuesUsingHashTable: + case common::FilterKind::kHugeintRange: { + if constexpr (std::is_same_v) { + processFilter(filter, rows, rawNulls); + } else { + const auto actualType = CppToType::create(); + VELOX_NYI( + "Expected type HUGEINT, but found file type {}.", + actualType->toString()); + } + break; + } + default: + VELOX_NYI("Unsupported filter: {}.", static_cast(filterKind)); + } } template @@ -119,14 +267,20 @@ void SelectiveDecimalColumnReader::read( int64_t offset, const RowSet& rows, const uint64_t* incomingNulls) { - VELOX_CHECK(!scanSpec_->filter()); VELOX_CHECK(!scanSpec_->valueHook()); prepareRead(offset, rows, incomingNulls); + if (!resultNulls_ || !resultNulls_->unique() || + resultNulls_->capacity() * 8 < rows.size()) { + // Make sure a dedicated resultNulls_ is allocated with enough capacity as + // RleDecoder always assumes it is available. + resultNulls_ = AlignedBuffer::allocate(rows.size(), memoryPool_); + rawResultNulls_ = resultNulls_->asMutable(); + } bool isDense = rows.back() == rows.size() - 1; if (isDense) { - readHelper(rows); + readHelper(scanSpec_->filter(), rows); } else { - readHelper(rows); + readHelper(scanSpec_->filter(), rows); } } @@ -134,16 +288,18 @@ template void SelectiveDecimalColumnReader::getValues( const RowSet& rows, VectorPtr* result) { + rawValues_ = values_->asMutable(); + getIntValues(rows, requestedType_, result); +} + +template +void SelectiveDecimalColumnReader::fillDecimals() { auto nullsPtr = resultNulls() ? resultNulls()->template as() : nullptr; auto scales = scaleBuffer_->as(); auto values = values_->asMutable(); - DecimalUtil::fillDecimals( values, nullsPtr, values, scales, numValues_, scale_); - - rawValues_ = values_->asMutable(); - getIntValues(rows, requestedType_, result); } template class SelectiveDecimalColumnReader; diff --git a/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.h b/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.h index 67a82b051e36..4482ef47fc50 100644 --- a/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.h @@ -49,7 +49,24 @@ class SelectiveDecimalColumnReader : public SelectiveColumnReader { private: template - void readHelper(RowSet rows); + void readHelper(common::Filter* filter, RowSet rows); + + // Process IsNull and IsNotNull filters. + void processNulls(bool isNull, const RowSet& rows, const uint64_t* rawNulls); + + // Process filters on decimal values. 
+ void processFilter( + const common::Filter* filter, + const RowSet& rows, + const uint64_t* rawNulls); + + // Dispatch to the respective filter processing based on the filter type. + void process( + const common::Filter* filter, + const RowSet& rows, + const uint64_t* rawNulls); + + void fillDecimals(); std::unique_ptr> valueDecoder_; std::unique_ptr> scaleDecoder_; diff --git a/velox/dwio/dwrf/test/E2EFilterTest.cpp b/velox/dwio/dwrf/test/E2EFilterTest.cpp index 43b67e91e550..6a56da80e891 100644 --- a/velox/dwio/dwrf/test/E2EFilterTest.cpp +++ b/velox/dwio/dwrf/test/E2EFilterTest.cpp @@ -15,6 +15,7 @@ */ #include "velox/common/base/Portability.h" +#include "velox/common/base/tests/GTestUtils.h" #include "velox/common/testutil/TestValue.h" #include "velox/dwio/common/tests/utils/E2EFilterTestBase.h" #include "velox/dwio/dwrf/reader/DwrfReader.h" @@ -64,11 +65,11 @@ class E2EFilterTest : public E2EFilterTestBase { const TypePtr& type, const std::vector& batches, bool forRowGroupSkip = false) override { - auto options = createWriterOptions(type); + setWriterOptions(type); int32_t flushCounter = 0; // If we test row group skip, we have all the data in one stripe. For // scan, we start a stripe every 'flushEveryNBatches_' batches. - options.flushPolicyFactory = [&]() { + options_.flushPolicyFactory = [&]() { return std::make_unique([&]() { return forRowGroupSkip ? false : (++flushCounter % flushEveryNBatches_ == 0); @@ -80,8 +81,8 @@ class E2EFilterTest : public E2EFilterTestBase { dwio::common::FileSink::Options{.pool = leafPool_.get()}); ASSERT_TRUE(sink->isBuffered()); auto* sinkPtr = sink.get(); - options.memoryPool = rootPool_.get(); - writer_ = std::make_unique(std::move(sink), options); + options_.memoryPool = rootPool_.get(); + writer_ = std::make_unique(std::move(sink), options_); for (auto& batch : batches) { writer_->write(batch); } @@ -105,9 +106,10 @@ class E2EFilterTest : public E2EFilterTestBase { } std::unordered_set flatMapColumns_; + dwrf::WriterOptions options_; private: - dwrf::WriterOptions createWriterOptions(const TypePtr& type) { + void setWriterOptions(const TypePtr& type) { auto config = std::make_shared(); config->set(dwrf::Config::COMPRESSION, CompressionKind_NONE); config->set(dwrf::Config::USE_VINTS, useVInts_); @@ -148,10 +150,8 @@ class E2EFilterTest : public E2EFilterTestBase { config->set>>( dwrf::Config::MAP_FLAT_COLS_STRUCT_KEYS, mapFlatColsStructKeys); } - dwrf::WriterOptions options; - options.config = config; - options.schema = writerSchema; - return options; + options_.config = config; + options_.schema = writerSchema; } std::unique_ptr writer_; @@ -227,6 +227,74 @@ TEST_F(E2EFilterTest, byteRle) { 20); } +DEBUG_ONLY_TEST_F(E2EFilterTest, shortDecimal) { + testutil::TestValue::enable(); + options_.format = DwrfFormat::kOrc; + const std::unordered_map types = { + {"shortdecimal_val:decimal(8, 5)", DECIMAL(8, 5)}, + {"shortdecimal_val:decimal(10, 5)", DECIMAL(10, 5)}, + {"shortdecimal_val:decimal(17, 5)", DECIMAL(17, 5)}}; + + for (const auto& pair : types) { + SCOPED_TESTVALUE_SET( + "facebook::velox::dwrf::ReaderBase::convertType", + std::function( + [&](TypePtr* type) { *type = pair.second; })); + testWithTypes( + pair.first, + [&]() { + makeIntDistribution( + "shortdecimal_val", + 10, // min + 100, // max + 22, // repeats + 19, // rareFrequency + -999, // rareMin + 30000, // rareMax + true); + }, + false, + {"shortdecimal_val"}, + 20); + } + options_.format = DwrfFormat::kDwrf; +} + +DEBUG_ONLY_TEST_F(E2EFilterTest, longDecimal) { + 
testutil::TestValue::enable(); + options_.format = DwrfFormat::kOrc; + const std::unordered_map types = { + {"longdecimal_val:decimal(30, 10)", DECIMAL(30, 10)}, + {"longdecimal_val:decimal(37, 15)", DECIMAL(37, 15)}}; + + SCOPED_TESTVALUE_SET( + "facebook::velox::dwrf::ProtoUtils::writeType", + std::function([&](bool* kindSet) { *kindSet = true; })); + for (const auto& pair : types) { + SCOPED_TESTVALUE_SET( + "facebook::velox::dwrf::ReaderBase::convertType", + std::function( + [&](TypePtr* type) { *type = pair.second; })); + testWithTypes( + pair.first, + [&]() { + makeIntDistribution( + "longdecimal_val", + 10, // min + 100, // max + 22, // repeats + 19, // rareFrequency + -999, // rareMin + 30000, // rareMax + true); + }, + false, + {"longdecimal_val"}, + 20); + } + options_.format = DwrfFormat::kDwrf; +} + TEST_F(E2EFilterTest, floatAndDouble) { testWithTypes( "float_val:float," diff --git a/velox/dwio/dwrf/utils/ProtoUtils.cpp b/velox/dwio/dwrf/utils/ProtoUtils.cpp index 405d2e79ddfb..e907c4126d3e 100644 --- a/velox/dwio/dwrf/utils/ProtoUtils.cpp +++ b/velox/dwio/dwrf/utils/ProtoUtils.cpp @@ -57,9 +57,22 @@ void ProtoUtils::writeType( if (parent) { parent->add_subtypes(footer.types_size() - 1); } - auto kind = - VELOX_STATIC_FIELD_DYNAMIC_DISPATCH(SchemaType, kind, type.kind()); - self->set_kind(kind); + bool kindSet = false; + if (type.kind() == TypeKind::HUGEINT) { + // Hugeint is not supported by DWRF, and this branch is only for ORC + // testing before the ORC footer write is implemented. + auto kind = SchemaType::kind; + self->set_kind(kind); + common::testutil::TestValue::adjust( + "facebook::velox::dwrf::ProtoUtils::writeType", &kindSet); + } else { + auto kind = + VELOX_STATIC_FIELD_DYNAMIC_DISPATCH(SchemaType, kind, type.kind()); + self->set_kind(kind); + kindSet = true; + } + VELOX_CHECK(kindSet, "Unknown type {}.", type.toString()); + switch (type.kind()) { case TypeKind::ROW: { auto& row = type.asRow(); diff --git a/velox/dwio/dwrf/writer/ColumnWriter.cpp b/velox/dwio/dwrf/writer/ColumnWriter.cpp index 2a4cf2077961..a1084f4d6104 100644 --- a/velox/dwio/dwrf/writer/ColumnWriter.cpp +++ b/velox/dwio/dwrf/writer/ColumnWriter.cpp @@ -2183,7 +2183,12 @@ std::unique_ptr BaseColumnWriter::create( context, type, sequence, onRecordPosition); ret->children_.reserve(type.size()); for (int32_t i = 0; i < type.size(); ++i) { - ret->children_.push_back(create(context, *type.childAt(i), sequence)); + ret->children_.push_back(create( + context, + *type.childAt(i), + sequence, + /*onRecordPosition=*/nullptr, + format)); } return ret; } @@ -2199,15 +2204,30 @@ std::unique_ptr BaseColumnWriter::create( } auto ret = std::make_unique( context, type, sequence, onRecordPosition); - ret->children_.push_back(create(context, *type.childAt(0), sequence)); - ret->children_.push_back(create(context, *type.childAt(1), sequence)); + ret->children_.push_back(create( + context, + *type.childAt(0), + sequence, + /*onRecordPosition=*/nullptr, + format)); + ret->children_.push_back(create( + context, + *type.childAt(1), + sequence, + /*onRecordPosition=*/nullptr, + format)); return ret; } case TypeKind::ARRAY: { VELOX_CHECK_EQ(type.size(), 1, "Array should have exactly one child"); auto ret = std::make_unique( context, type, sequence, onRecordPosition); - ret->children_.push_back(create(context, *type.childAt(0), sequence)); + ret->children_.push_back(create( + context, + *type.childAt(0), + sequence, + /*onRecordPosition=*/nullptr, + format)); return ret; } default: diff --git 
a/velox/dwio/dwrf/writer/Writer.cpp b/velox/dwio/dwrf/writer/Writer.cpp index d6011c38f8de..b5af93a2cc1b 100644 --- a/velox/dwio/dwrf/writer/Writer.cpp +++ b/velox/dwio/dwrf/writer/Writer.cpp @@ -200,7 +200,12 @@ Writer::Writer( } if (options.columnWriterFactory == nullptr) { - writer_ = BaseColumnWriter::create(writerBase_->getContext(), *schema_); + writer_ = BaseColumnWriter::create( + writerBase_->getContext(), + *schema_, + /*sequence=*/0, + /*onRecordPosition=*/nullptr, + options.format); } else { writer_ = options.columnWriterFactory(writerBase_->getContext(), *schema_); } From eee1cf6e8a1f414a0ed4bde4e3121cb8cdfef9d7 Mon Sep 17 00:00:00 2001 From: Ke Jia Date: Tue, 22 Apr 2025 08:58:25 +0000 Subject: [PATCH 05/10] [11771] [11772] Fix smj result mismatch issue --- velox/exec/MergeJoin.cpp | 206 ++++++++++++++--------------- velox/exec/MergeJoin.h | 33 ++++- velox/exec/MergeSource.cpp | 10 +- velox/exec/MergeSource.h | 1 + velox/exec/tests/MergeJoinTest.cpp | 135 +++++++++++++++++++ 5 files changed, 271 insertions(+), 114 deletions(-) diff --git a/velox/exec/MergeJoin.cpp b/velox/exec/MergeJoin.cpp index 550c059cc81f..96d27fe179d1 100644 --- a/velox/exec/MergeJoin.cpp +++ b/velox/exec/MergeJoin.cpp @@ -18,6 +18,8 @@ #include "velox/exec/Task.h" #include "velox/expression/FieldReference.h" +#include + namespace facebook::velox::exec { MergeJoin::MergeJoin( @@ -93,7 +95,7 @@ void MergeJoin::initialize() { joinNode_->isRightJoin() || joinNode_->isFullJoin()) { joinTracker_ = JoinTracker(outputBatchSize_, pool()); } - } else if (joinNode_->isAntiJoin()) { + } else if (joinNode_->isAntiJoin() || joinNode_->isFullJoin()) { // Anti join needs to track the left side rows that have no match on the // right. joinTracker_ = JoinTracker(outputBatchSize_, pool()); @@ -387,7 +389,8 @@ bool MergeJoin::tryAddOutputRow( const RowVectorPtr& leftBatch, vector_size_t leftRow, const RowVectorPtr& rightBatch, - vector_size_t rightRow) { + vector_size_t rightRow, + bool isRightJoinForFullOuter) { if (outputSize_ == outputBatchSize_) { return false; } @@ -421,12 +424,15 @@ bool MergeJoin::tryAddOutputRow( filterRightInputProjections_); if (joinTracker_) { - if (isRightJoin(joinType_)) { + if (isRightJoin(joinType_) || + (isFullJoin(joinType_) && isRightJoinForFullOuter)) { // Record right-side row with a match on the left-side. - joinTracker_->addMatch(rightBatch, rightRow, outputSize_); + joinTracker_->addMatch( + rightBatch, rightRow, outputSize_, isRightJoinForFullOuter); } else { // Record left-side row with a match on the right-side. - joinTracker_->addMatch(leftBatch, leftRow, outputSize_); + joinTracker_->addMatch( + leftBatch, leftRow, outputSize_, isRightJoinForFullOuter); } } } @@ -436,7 +442,8 @@ bool MergeJoin::tryAddOutputRow( if (isAntiJoin(joinType_)) { VELOX_CHECK(joinTracker_.has_value()); // Record left-side row with a match on the right-side. - joinTracker_->addMatch(leftBatch, leftRow, outputSize_); + joinTracker_->addMatch( + leftBatch, leftRow, outputSize_, isRightJoinForFullOuter); } ++outputSize_; @@ -455,14 +462,14 @@ bool MergeJoin::prepareOutput( return true; } - if (isRightJoin(joinType_) && right != currentRight_) { - return true; - } - // If there is a new right, we need to flatten the dictionary. 
if (!isRightFlattened_ && right && currentRight_ != right) { flattenRightProjections(); } + + if (right != currentRight_) { + return true; + } return false; } @@ -574,6 +581,39 @@ bool MergeJoin::prepareOutput( bool MergeJoin::addToOutput() { if (isRightJoin(joinType_) || isRightSemiFilterJoin(joinType_)) { return addToOutputForRightJoin(); + } else if (isFullJoin(joinType_) && filter_) { + if (!leftForRightJoinMatch_) { + leftForRightJoinMatch_ = leftMatch_; + rightForRightJoinMatch_ = rightMatch_; + } + + if (leftMatch_ && rightMatch_ && !leftJoinForFullFinished_) { + auto left = addToOutputForLeftJoin(); + if (!leftMatch_) { + leftJoinForFullFinished_ = true; + } + if (left) { + if (!leftMatch_) { + leftMatch_ = leftForRightJoinMatch_; + rightMatch_ = rightForRightJoinMatch_; + } + + return true; + } + } + + if (!leftMatch_ && !rightJoinForFullFinished_) { + leftMatch_ = leftForRightJoinMatch_; + rightMatch_ = rightForRightJoinMatch_; + rightJoinForFullFinished_ = true; + } + + auto right = addToOutputForRightJoin(); + + leftForRightJoinMatch_ = leftMatch_; + rightForRightJoinMatch_ = rightMatch_; + + return right; } else { return addToOutputForLeftJoin(); } @@ -720,7 +760,13 @@ bool MergeJoin::addToOutputForRightJoin() { } for (auto j = leftStartRow; j < leftEndRow; ++j) { - if (!tryAddOutputRow(leftBatch, j, rightBatch, i)) { + auto isRightJoinForFullOuter = false; + if (isFullJoin(joinType_)) { + isRightJoinForFullOuter = true; + } + + if (!tryAddOutputRow( + leftBatch, j, rightBatch, i, isRightJoinForFullOuter)) { // If we run out of space in the current output_, we will need to // produce a buffer and continue processing left later. In this // case, we cannot leave left as a lazy vector, since we cannot have @@ -818,7 +864,7 @@ RowVectorPtr MergeJoin::getOutput() { continue; } else if (isAntiJoin(joinType_)) { output = filterOutputForAntiJoin(output); - if (output) { + if (output != nullptr && output->size() > 0) { return output; } @@ -926,6 +972,8 @@ RowVectorPtr MergeJoin::doGetOutput() { // results from the current match. if (addToOutput()) { return std::move(output_); + } else { + previousLeftMatch_ = leftMatch_; } } @@ -990,6 +1038,8 @@ RowVectorPtr MergeJoin::doGetOutput() { if (addToOutput()) { return std::move(output_); + } else { + previousLeftMatch_ = leftMatch_; } } @@ -1134,7 +1184,7 @@ RowVectorPtr MergeJoin::doGetOutput() { isFullJoin(joinType_)) { // If output_ is currently wrapping a different buffer, return it // first. - if (prepareOutput(input_, nullptr)) { + if (prepareOutput(input_, rightInput_)) { output_->resize(outputSize_); return std::move(output_); } @@ -1159,7 +1209,7 @@ RowVectorPtr MergeJoin::doGetOutput() { if (isRightJoin(joinType_) || isFullJoin(joinType_)) { // If output_ is currently wrapping a different buffer, return it // first. - if (prepareOutput(nullptr, rightInput_)) { + if (prepareOutput(input_, rightInput_)) { output_->resize(outputSize_); return std::move(output_); } @@ -1211,6 +1261,8 @@ RowVectorPtr MergeJoin::doGetOutput() { endRightRow < rightInput_->size(), std::nullopt}; + leftJoinForFullFinished_ = false; + rightJoinForFullFinished_ = false; if (!leftMatch_->complete || !rightMatch_->complete) { if (!leftMatch_->complete) { // Need to continue looking for the end of match. 
@@ -1239,6 +1291,8 @@ RowVectorPtr MergeJoin::doGetOutput() { if (addToOutput()) { return std::move(output_); + } else { + previousLeftMatch_ = leftMatch_; } if (!rightInput_) { @@ -1255,8 +1309,6 @@ RowVectorPtr MergeJoin::doGetOutput() { RowVectorPtr MergeJoin::applyFilter(const RowVectorPtr& output) { const auto numRows = output->size(); - RowVectorPtr fullOuterOutput = nullptr; - BufferPtr indices = allocateIndices(numRows, pool()); auto* rawIndices = indices->asMutable(); vector_size_t numPassed = 0; @@ -1273,76 +1325,29 @@ RowVectorPtr MergeJoin::applyFilter(const RowVectorPtr& output) { // If all matches for a given left-side row fail the filter, add a row to // the output with nulls for the right-side columns. - const auto onMiss = [&](auto row) { - if (isAntiJoin(joinType_)) { - return; - } - rawIndices[numPassed++] = row; - - if (isFullJoin(joinType_)) { - // For filtered rows, it is necessary to insert additional data - // to ensure the result set is complete. Specifically, we - // need to generate two records: one record containing the - // columns from the left table along with nulls for the - // right table, and another record containing the columns - // from the right table along with nulls for the left table. - // For instance, the current output is filtered based on the condition - // t > 1. - - // 1, 1 - // 2, 2 - // 3, 3 - - // In this scenario, we need to additionally insert a record 1, 1. - // Subsequently, we will set the values of the columns on the left to - // null and the values of the columns on the right to null as well. By - // doing so, we will obtain the final result set. - - // 1, null - // null, 1 - // 2, 2 - // 3, 3 - fullOuterOutput = BaseVector::create( - output->type(), output->size() + 1, pool()); - - for (auto i = 0; i < row + 1; ++i) { - for (auto j = 0; j < output->type()->size(); ++j) { - fullOuterOutput->childAt(j)->copy( - output->childAt(j).get(), i, i, 1); + auto onMiss = [&](auto row, bool flag) { + if (!isLeftSemiFilterJoin(joinType_) && + !isRightSemiFilterJoin(joinType_)) { + rawIndices[numPassed++] = row; + + if (!isRightJoin(joinType_)) { + if (isFullJoin(joinType_) && flag) { + for (auto& projection : leftProjections_) { + auto target = output->childAt(projection.outputChannel); + target->setNull(row, true); + } + } else { + for (auto& projection : rightProjections_) { + auto target = output->childAt(projection.outputChannel); + target->setNull(row, true); + } } - } - - for (auto j = 0; j < output->type()->size(); ++j) { - fullOuterOutput->childAt(j)->copy( - output->childAt(j).get(), row + 1, row, 1); - } - - for (auto i = row + 1; i < output->size(); ++i) { - for (auto j = 0; j < output->type()->size(); ++j) { - fullOuterOutput->childAt(j)->copy( - output->childAt(j).get(), i + 1, i, 1); + } else { + for (auto& projection : leftProjections_) { + auto target = output->childAt(projection.outputChannel); + target->setNull(row, true); } } - - for (auto& projection : leftProjections_) { - auto& target = fullOuterOutput->childAt(projection.outputChannel); - target->setNull(row, true); - } - - for (auto& projection : rightProjections_) { - auto& target = fullOuterOutput->childAt(projection.outputChannel); - target->setNull(row + 1, true); - } - } else if (!isRightJoin(joinType_)) { - for (auto& projection : rightProjections_) { - auto& target = output->childAt(projection.outputChannel); - target->setNull(row, true); - } - } else { - for (auto& projection : leftProjections_) { - auto& target = output->childAt(projection.outputChannel); - 
target->setNull(row, true); - } } }; @@ -1353,12 +1358,8 @@ RowVectorPtr MergeJoin::applyFilter(const RowVectorPtr& output) { joinTracker_->processFilterResult(i, passed, onMiss); - if (isAntiJoin(joinType_)) { - if (!passed) { - rawIndices[numPassed++] = i; - } - } else { - if (passed) { + if (!isAntiJoin(joinType_)) { + if (passed && !joinTracker_->isRightJoinForFullOuter(i)) { rawIndices[numPassed++] = i; } } @@ -1371,19 +1372,19 @@ RowVectorPtr MergeJoin::applyFilter(const RowVectorPtr& output) { // Every time we start a new left key match, `processFilterResult()` will // check if at least one row from the previous match passed the filter. If - // none did, it calls onMiss to add a record with null right projections to - // the output. + // none did, it calls onMiss to add a record with null right projections + // to the output. // // Before we leave the current buffer, since we may not have seen the next - // left key match yet, the last key match may still be pending to produce a - // row (because `processFilterResult()` was not called yet). + // left key match yet, the last key match may still be pending to produce + // a row (because `processFilterResult()` was not called yet). // // To handle this, we need to call `noMoreFilterResults()` unless the - // same current left key match may continue in the next buffer. So there are - // two cases to check: + // same current left key match may continue in the next buffer. So there + // are two cases to check: // - // 1. If leftMatch_ is nullopt, there for sure the next buffer will contain - // a different key match. + // 1. If leftMatch_ is nullopt, there for sure the next buffer will + // contain a different key match. // // 2. leftMatch_ may not be nullopt, but may be related to a different // (subsequent) left key. So we check if the last row in the batch has the @@ -1391,6 +1392,10 @@ RowVectorPtr MergeJoin::applyFilter(const RowVectorPtr& output) { if (!leftMatch_ || !joinTracker_->isCurrentLeftMatch(numRows - 1)) { joinTracker_->noMoreFilterResults(onMiss); } + + if (isAntiJoin(joinType_) && leftMatch_ && !previousLeftMatch_) { + joinTracker_->noMoreFilterResults(onMiss); + } } else { filterRows_.resize(numRows); filterRows_.setAll(); @@ -1412,17 +1417,10 @@ RowVectorPtr MergeJoin::applyFilter(const RowVectorPtr& output) { if (numPassed == numRows) { // All rows passed. - if (fullOuterOutput) { - return fullOuterOutput; - } return output; } // Some, but not all rows passed. 
- if (fullOuterOutput) { - return wrap(numPassed, indices, fullOuterOutput); - } - return wrap(numPassed, indices, output); } diff --git a/velox/exec/MergeJoin.h b/velox/exec/MergeJoin.h index 47dea482554d..674f932d0678 100644 --- a/velox/exec/MergeJoin.h +++ b/velox/exec/MergeJoin.h @@ -246,7 +246,8 @@ class MergeJoin : public Operator { const RowVectorPtr& leftBatch, vector_size_t leftRow, const RowVectorPtr& rightBatch, - vector_size_t rightRow); + vector_size_t rightRow, + bool isRightJoinForFullOuter = false); // If the right side projected columns in the current output vector happen to // span more than one vector from the right side, they cannot be simply @@ -340,6 +341,9 @@ class MergeJoin : public Operator { : matchingRows_{numRows, false} { leftRowNumbers_ = AlignedBuffer::allocate(numRows, pool); rawLeftRowNumbers_ = leftRowNumbers_->asMutable(); + + rightJoinRows_ = AlignedBuffer::allocate(numRows, pool); + rawRightJoinRows_ = rightJoinRows_->asMutable(); } // Records a row of output that corresponds to a match between a left-side @@ -350,7 +354,8 @@ class MergeJoin : public Operator { void addMatch( const VectorPtr& vector, vector_size_t row, - vector_size_t outputIndex) { + vector_size_t outputIndex, + bool rightJoinForFullOuter = false) { matchingRows_.setValid(outputIndex, true); if (lastVector_ != vector || lastIndex_ != row) { @@ -361,6 +366,7 @@ class MergeJoin : public Operator { } rawLeftRowNumbers_[outputIndex] = lastLeftRowNumber_; + rawRightJoinRows_[outputIndex] = rightJoinForFullOuter; } // Returns a subset of "match" rows in [0, numRows) range that were @@ -402,7 +408,7 @@ class MergeJoin : public Operator { const auto rowNumber = rawLeftRowNumbers_[outputIndex]; if (currentLeftRowNumber_ != rowNumber) { if (currentRow_ != -1 && !currentRowPassed_) { - onMiss(currentRow_); + onMiss(currentRow_, rawRightJoinRows_[currentRow_]); } currentRow_ = outputIndex; currentLeftRowNumber_ = rowNumber; @@ -428,8 +434,8 @@ class MergeJoin : public Operator { // filter failed for all matches of that row. template void noMoreFilterResults(TOnMiss onMiss) { - if (!currentRowPassed_) { - onMiss(currentRow_); + if (!currentRowPassed_ && currentRow_ >= 0) { + onMiss(currentRow_, rawRightJoinRows_[currentRow_]); } currentRow_ = -1; @@ -437,6 +443,10 @@ class MergeJoin : public Operator { } void reset(); + + bool isRightJoinForFullOuter(vector_size_t row) { + return rawRightJoinRows_[row]; + } private: // A subset of output rows where left side matched right side on the join @@ -457,6 +467,9 @@ class MergeJoin : public Operator { BufferPtr leftRowNumbers_; vector_size_t* rawLeftRowNumbers_; + BufferPtr rightJoinRows_; + bool* rawRightJoinRows_; + // Synthetic number assigned to the last added "match" row or zero if no row // has been added yet. vector_size_t lastLeftRowNumber_{0}; @@ -552,6 +565,9 @@ class MergeJoin : public Operator { // A set of rows with matching keys on the left side. std::optional leftMatch_; + std::optional previousLeftMatch_ = + Match{{}, -1, -1, false, std::nullopt}; + // A set of rows with matching keys on the right side. 
std::optional rightMatch_; @@ -568,5 +584,12 @@ class MergeJoin : public Operator { bool leftHasDrained_{false}; bool rightHasDrained_{false}; + bool leftJoinForFullFinished_{false}; + + bool rightJoinForFullFinished_{false}; + + std::optional leftForRightJoinMatch_; + + std::optional rightForRightJoinMatch_; }; } // namespace facebook::velox::exec diff --git a/velox/exec/MergeSource.cpp b/velox/exec/MergeSource.cpp index 411decd83e6e..cc9a4a4be834 100644 --- a/velox/exec/MergeSource.cpp +++ b/velox/exec/MergeSource.cpp @@ -318,9 +318,9 @@ BlockingReason MergeJoinSource::next( "facebook::velox::exec::MergeJoinSource::next", this); ScopedPromiseNotification notification(1); return state_.withWLock([&](auto& state) { - VELOX_CHECK_LE(!!state.atEnd + !!state.drained, 1); - if (state.data != nullptr) { - *data = std::move(state.data); + if (!state.dataQueue.empty()) { + *data = std::move(state.dataQueue.front()); + state.dataQueue.pop(); deferNotify(producerPromise_, notification); return BlockingReason::kNotBlocked; @@ -369,9 +369,9 @@ BlockingReason MergeJoinSource::enqueue( return BlockingReason::kNotBlocked; } - VELOX_CHECK_NULL(state.data); - state.data = std::move(data); + state.dataQueue.push(std::move(data)); deferNotify(consumerPromise_, notification); + return waitForConsumer(future); }); } diff --git a/velox/exec/MergeSource.h b/velox/exec/MergeSource.h index e5892add6ed9..52961a679521 100644 --- a/velox/exec/MergeSource.h +++ b/velox/exec/MergeSource.h @@ -94,6 +94,7 @@ class MergeJoinSource { // after the consumer receives the drained signal. bool drained = false; RowVectorPtr data; + std::queue dataQueue; }; folly::Synchronized state_; diff --git a/velox/exec/tests/MergeJoinTest.cpp b/velox/exec/tests/MergeJoinTest.cpp index 5714301a2f3b..7323a1ca1dea 100644 --- a/velox/exec/tests/MergeJoinTest.cpp +++ b/velox/exec/tests/MergeJoinTest.cpp @@ -1306,6 +1306,45 @@ TEST_F(MergeJoinTest, fullOuterJoin) { "SELECT * FROM t FULL OUTER JOIN u ON t.t0 = u.u0 AND t.t0 > 2"); } +TEST_F(MergeJoinTest, fullOuterJoinWithDuplicateMatch) { + // Each row on the left side has at most one match on the right side. + auto left = makeRowVector( + {"a", "b"}, + { + makeNullableFlatVector({1, 2, 2, 2, 3, 5, 6, std::nullopt}), + makeNullableFlatVector( + {2.0, 100.0, 1.0, 1.0, 3.0, 1.0, 6.0, std::nullopt}), + }); + + auto right = makeRowVector( + {"c", "d"}, + { + makeNullableFlatVector( + {0, 2, 2, 2, 2, 3, 4, 5, 7, std::nullopt}), + makeNullableFlatVector( + {0.0, 3.0, -1.0, -1.0, 3.0, 2.0, 1.0, 3.0, 7.0, std::nullopt}), + }); + + createDuckDbTable("t", {left}); + createDuckDbTable("u", {right}); + + auto planNodeIdGenerator = std::make_shared(); + + auto rightPlan = + PlanBuilder(planNodeIdGenerator) + .values({left}) + .mergeJoin( + {"a"}, + {"c"}, + PlanBuilder(planNodeIdGenerator).values({right}).planNode(), + "b < d", + {"a", "b", "c", "d"}, + core::JoinType::kFull) + .planNode(); + AssertQueryBuilder(rightPlan, duckDbQueryRunner_) + .assertResults("SELECT * from t FULL OUTER JOIN u ON a = c AND b < d"); +} + TEST_F(MergeJoinTest, fullOuterJoinNoFilter) { auto left = makeRowVector( {"t0", "t1", "t2", "t3"}, @@ -1811,3 +1850,99 @@ TEST_F(MergeJoinTest, barrier) { } } } + +TEST_F( + MergeJoinTest, + antiJoinWithFilterWithMultiMatchedRowsInDifferentBatches) { + auto left = + makeRowVector({"t0"}, {makeNullableFlatVector({1, 2, 3})}); + + auto right = + makeRowVector({"u0"}, {makeNullableFlatVector({1, 2, 3})}); + + createDuckDbTable("t", {left}); + createDuckDbTable("u", {right}); + + // Anti join. 
+ auto planNodeIdGenerator = std::make_shared(); + auto plan = PlanBuilder(planNodeIdGenerator) + .values({split(left, 2)}) + .mergeJoin( + {"t0"}, + {"u0"}, + PlanBuilder(planNodeIdGenerator) + .values(split(right, 2)) + .planNode(), + "t0 > 2", + {"t0"}, + core::JoinType::kAnti) + .planNode(); + + AssertQueryBuilder(plan, duckDbQueryRunner_) + .config(core::QueryConfig::kPreferredOutputBatchRows, "2") + .config(core::QueryConfig::kMaxOutputBatchRows, "2") + .assertResults( + "SELECT t0 FROM t WHERE NOT exists (select 1 from u where t0 = u0 AND t.t0 > 2 ) "); +} + +TEST_F(MergeJoinTest, antiJoinWithFilterWithMultiMatchedRows) { + auto left = makeRowVector({"t0"}, {makeNullableFlatVector({1, 2})}); + + auto right = + makeRowVector({"u0"}, {makeNullableFlatVector({1, 2, 2, 2})}); + + createDuckDbTable("t", {left}); + createDuckDbTable("u", {right}); + + // Anti join. + auto planNodeIdGenerator = std::make_shared(); + auto plan = + PlanBuilder(planNodeIdGenerator) + .values({left}) + .mergeJoin( + {"t0"}, + {"u0"}, + PlanBuilder(planNodeIdGenerator).values({right}).planNode(), + "t0 > 2", + {"t0"}, + core::JoinType::kAnti) + .planNode(); + + AssertQueryBuilder(plan, duckDbQueryRunner_) + .assertResults( + "SELECT t0 FROM t WHERE NOT exists (select 1 from u where t0 = u0 AND t.t0 > 2 ) "); +} + +TEST_F(MergeJoinTest, antiJoinWithTwoJoinKeysInDifferentBatch) { + auto left = makeRowVector( + {"a", "b"}, + {makeNullableFlatVector({1, 1, 1, 1}), + makeNullableFlatVector({3.0, 3.0, 3.0, 3.0})}); + + auto right = makeRowVector( + {"c", "d"}, + {makeNullableFlatVector({1, 1, 1}), + makeNullableFlatVector({2.0, 2.0, 4.0})}); + + createDuckDbTable("t", {left}); + createDuckDbTable("u", {right}); + + // Anti join. + auto planNodeIdGenerator = std::make_shared(); + auto plan = PlanBuilder(planNodeIdGenerator) + .values({split(left, 2)}) + .mergeJoin( + {"a"}, + {"c"}, + PlanBuilder(planNodeIdGenerator) + .values({split(right, 2)}) + .planNode(), + "b < d", + {"a", "b"}, + core::JoinType::kAnti) + .planNode(); + + AssertQueryBuilder(plan, duckDbQueryRunner_) + .assertResults( + "SELECT * FROM t WHERE NOT exists (select * from u where t.a = u.c and t.b < u.d)"); +} \ No newline at end of file From d041885b61d50e02ff965e8bfeb396e376257861 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Thu, 15 May 2025 10:02:34 +0100 Subject: [PATCH 06/10] Revert "fix(parquet): Avoid SEGV if table column type does not match file column type (#12350)" This reverts commit 5ad65e4a487ea4a43dc808802726a479b017b1b8. 
--- velox/dwio/parquet/reader/ParquetReader.cpp | 170 ++++-------------- .../tests/reader/ParquetReaderTest.cpp | 42 ----- .../tests/reader/ParquetTableScanTest.cpp | 76 -------- 3 files changed, 32 insertions(+), 256 deletions(-) diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index 7a90be1879ad..7bd97c9dca2e 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -719,70 +719,62 @@ TypePtr ReaderBase::convertType( schemaElement.__isset.type_length, "FIXED_LEN_BYTE_ARRAY requires length to be set"); - static std::string_view kTypeMappingErrorFmtStr = - "Converted type {} is not allowed for requested type {}"; if (schemaElement.__isset.converted_type) { switch (schemaElement.converted_type) { case thrift::ConvertedType::INT_8: - case thrift::ConvertedType::UINT_8: VELOX_CHECK_EQ( schemaElement.type, thrift::Type::INT32, - "{} converted type can only be set for value of thrift::Type::INT32", - schemaElement.converted_type); - VELOX_CHECK( - !requestedType || requestedType->kind() == TypeKind::TINYINT || - requestedType->kind() == TypeKind::SMALLINT || - requestedType->kind() == TypeKind::INTEGER || - requestedType->kind() == TypeKind::BIGINT, - kTypeMappingErrorFmtStr, - "TINYINT", - requestedType->toString()); + "INT8 converted type can only be set for value of thrift::Type::INT32"); return TINYINT(); case thrift::ConvertedType::INT_16: - case thrift::ConvertedType::UINT_16: VELOX_CHECK_EQ( schemaElement.type, thrift::Type::INT32, - "{} converted type can only be set for value of thrift::Type::INT32", - schemaElement.converted_type); - VELOX_CHECK( - !requestedType || requestedType->kind() == TypeKind::SMALLINT || - requestedType->kind() == TypeKind::INTEGER || - requestedType->kind() == TypeKind::BIGINT, - kTypeMappingErrorFmtStr, - "SMALLINT", - requestedType->toString()); + "INT16 converted type can only be set for value of thrift::Type::INT32"); return SMALLINT(); case thrift::ConvertedType::INT_32: - case thrift::ConvertedType::UINT_32: VELOX_CHECK_EQ( schemaElement.type, thrift::Type::INT32, - "{} converted type can only be set for value of thrift::Type::INT32", - schemaElement.converted_type); - VELOX_CHECK( - !requestedType || requestedType->kind() == TypeKind::INTEGER || - requestedType->kind() == TypeKind::BIGINT, - kTypeMappingErrorFmtStr, - "INTEGER", - requestedType->toString()); + "INT32 converted type can only be set for value of thrift::Type::INT32"); return INTEGER(); case thrift::ConvertedType::INT_64: + VELOX_CHECK_EQ( + schemaElement.type, + thrift::Type::INT64, + "INT64 converted type can only be set for value of thrift::Type::INT64"); + return BIGINT(); + + case thrift::ConvertedType::UINT_8: + VELOX_CHECK_EQ( + schemaElement.type, + thrift::Type::INT32, + "UINT_8 converted type can only be set for value of thrift::Type::INT32"); + return TINYINT(); + + case thrift::ConvertedType::UINT_16: + VELOX_CHECK_EQ( + schemaElement.type, + thrift::Type::INT32, + "UINT_16 converted type can only be set for value of thrift::Type::INT32"); + return SMALLINT(); + + case thrift::ConvertedType::UINT_32: + VELOX_CHECK_EQ( + schemaElement.type, + thrift::Type::INT32, + "UINT_32 converted type can only be set for value of thrift::Type::INT32"); + return INTEGER(); + case thrift::ConvertedType::UINT_64: VELOX_CHECK_EQ( schemaElement.type, thrift::Type::INT64, - "{} converted type can only be set for value of thrift::Type::INT32", - schemaElement.converted_type); - VELOX_CHECK( - 
!requestedType || requestedType->kind() == TypeKind::BIGINT, - kTypeMappingErrorFmtStr, - "BIGINT", - requestedType->toString()); + "UINT_64 converted type can only be set for value of thrift::Type::INT64"); return BIGINT(); case thrift::ConvertedType::DATE: @@ -790,11 +782,6 @@ TypePtr ReaderBase::convertType( schemaElement.type, thrift::Type::INT32, "DATE converted type can only be set for value of thrift::Type::INT32"); - VELOX_CHECK( - !requestedType || requestedType->isDate(), - kTypeMappingErrorFmtStr, - "DATE", - requestedType->toString()); return DATE(); case thrift::ConvertedType::TIMESTAMP_MICROS: @@ -803,65 +790,19 @@ TypePtr ReaderBase::convertType( schemaElement.type, thrift::Type::INT64, "TIMESTAMP_MICROS or TIMESTAMP_MILLIS converted type can only be set for value of thrift::Type::INT64"); - VELOX_CHECK( - !requestedType || requestedType->kind() == TypeKind::TIMESTAMP, - kTypeMappingErrorFmtStr, - "TIMESTAMP", - requestedType->toString()); return TIMESTAMP(); case thrift::ConvertedType::DECIMAL: { VELOX_CHECK( schemaElement.__isset.precision && schemaElement.__isset.scale, "DECIMAL requires a length and scale specifier!"); - const auto schemaElementPrecision = schemaElement.precision; - const auto schemaElementScale = schemaElement.scale; - // A long decimal requested type cannot read a value of a short decimal. - // As a result, the mapping from short to long decimal is currently - // restricted. - auto type = DECIMAL(schemaElementPrecision, schemaElementScale); - if (requestedType) { - VELOX_CHECK( - requestedType->isDecimal(), - kTypeMappingErrorFmtStr, - "DECIMAL", - requestedType->toString()); - // Reading short decimals with a long decimal requested type is not - // yet possible. To allow for correct interpretation of the values, - // the scale of the file type and requested type must match while - // precision may be larger. 
- if (requestedType->isShortDecimal()) { - const auto& shortDecimalType = requestedType->asShortDecimal(); - VELOX_CHECK( - type->isShortDecimal() && - shortDecimalType.precision() >= schemaElementPrecision && - shortDecimalType.scale() == schemaElementScale, - kTypeMappingErrorFmtStr, - type->toString(), - requestedType->toString()); - } else { - const auto& longDecimalType = requestedType->asLongDecimal(); - VELOX_CHECK( - type->isLongDecimal() && - longDecimalType.precision() >= schemaElementPrecision && - longDecimalType.scale() == schemaElementScale, - kTypeMappingErrorFmtStr, - type->toString(), - requestedType->toString()); - } - } - return type; + return DECIMAL(schemaElement.precision, schemaElement.scale); } case thrift::ConvertedType::UTF8: switch (schemaElement.type) { case thrift::Type::BYTE_ARRAY: case thrift::Type::FIXED_LEN_BYTE_ARRAY: - VELOX_CHECK( - !requestedType || requestedType->kind() == TypeKind::VARCHAR, - kTypeMappingErrorFmtStr, - "VARCHAR", - requestedType->toString()); return VARCHAR(); default: VELOX_FAIL( @@ -872,11 +813,6 @@ TypePtr ReaderBase::convertType( schemaElement.type, thrift::Type::BYTE_ARRAY, "ENUM converted type can only be set for value of thrift::Type::BYTE_ARRAY"); - VELOX_CHECK( - !requestedType || requestedType->kind() == TypeKind::VARCHAR, - kTypeMappingErrorFmtStr, - "VARCHAR", - requestedType->toString()); return VARCHAR(); } case thrift::ConvertedType::MAP: @@ -895,69 +831,27 @@ TypePtr ReaderBase::convertType( } else { switch (schemaElement.type) { case thrift::Type::type::BOOLEAN: - VELOX_CHECK( - !requestedType || requestedType->kind() == TypeKind::BOOLEAN, - kTypeMappingErrorFmtStr, - "BOOLEAN", - requestedType->toString()); return BOOLEAN(); case thrift::Type::type::INT32: - VELOX_CHECK( - !requestedType || requestedType->kind() == TypeKind::INTEGER || - requestedType->kind() == TypeKind::BIGINT, - kTypeMappingErrorFmtStr, - "INTEGER", - requestedType->toString()); return INTEGER(); case thrift::Type::type::INT64: // For Int64 Timestamp in nano precision if (schemaElement.__isset.logicalType && schemaElement.logicalType.__isset.TIMESTAMP) { - VELOX_CHECK( - !requestedType || requestedType->kind() == TypeKind::TIMESTAMP, - kTypeMappingErrorFmtStr, - "TIMESTAMP", - requestedType->toString()); return TIMESTAMP(); } - VELOX_CHECK( - !requestedType || requestedType->kind() == TypeKind::BIGINT, - kTypeMappingErrorFmtStr, - "BIGINT", - requestedType->toString()); return BIGINT(); case thrift::Type::type::INT96: - VELOX_CHECK( - !requestedType || requestedType->kind() == TypeKind::TIMESTAMP, - kTypeMappingErrorFmtStr, - "TIMESTAMP", - requestedType->toString()); return TIMESTAMP(); // INT96 only maps to a timestamp case thrift::Type::type::FLOAT: - VELOX_CHECK( - !requestedType || requestedType->kind() == TypeKind::REAL || - requestedType->kind() == TypeKind::DOUBLE, - kTypeMappingErrorFmtStr, - "REAL", - requestedType->toString()); return REAL(); case thrift::Type::type::DOUBLE: - VELOX_CHECK( - !requestedType || requestedType->kind() == TypeKind::DOUBLE, - kTypeMappingErrorFmtStr, - "DOUBLE", - requestedType->toString()); return DOUBLE(); case thrift::Type::type::BYTE_ARRAY: case thrift::Type::type::FIXED_LEN_BYTE_ARRAY: if (requestedType && requestedType->isVarchar()) { return VARCHAR(); } else { - VELOX_CHECK( - !requestedType || requestedType->isVarbinary(), - kTypeMappingErrorFmtStr, - "VARBINARY", - requestedType->toString()); return VARBINARY(); } diff --git a/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp 
b/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp index f1ee25766dbf..4fd939106d04 100644 --- a/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include "velox/common/base/tests/GTestUtils.h" #include "velox/dwio/parquet/tests/ParquetTestBase.h" #include "velox/expression/ExprToSubfieldFilter.h" #include "velox/vector/tests/utils/VectorMaker.h" @@ -1692,44 +1691,3 @@ TEST_F(ParquetReaderTest, parquet251) { assertReadWithFilters( "parquet-251.parquet", rowType, std::move(filters), expected); } - -TEST_F(ParquetReaderTest, fileColumnVarcharToMetadataColumnMismatchTest) { - const std::string sample(getExampleFilePath("nation.parquet")); - - dwio::common::ReaderOptions readerOptions{leafPool_.get()}; - - auto runVarcharColTest = [&](const TypePtr& requestedType) { - // The type in the file is a BYTE_ARRAY resolving to VARCHAR. - // The requested type must match with what is requested as otherwise: - // - errors occur in the column readers - // - SIGSEGVs can be encountered during partitioning and subsequent - // operators following the table scan - auto outputRowType = - ROW({"nationkey", "name", "regionkey", "comment"}, - {BIGINT(), requestedType, BIGINT(), VARCHAR()}); - - // Sets the metadata schema requested, for example from Hive, and not the - // schema from the file. - readerOptions.setFileSchema(outputRowType); - VELOX_ASSERT_THROW( - createReader(sample, readerOptions), - fmt::format( - "Converted type VARCHAR is not allowed for requested type {}", - requestedType->toString())); - }; - - auto types = std::vector{ - SMALLINT(), - INTEGER(), - BIGINT(), - DECIMAL(10, 2), - REAL(), - DOUBLE(), - TIMESTAMP(), - DATE(), - VARBINARY()}; - - for (const auto& type : types) { - runVarcharColTest(type); - } -} diff --git a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp index e8140e6c07f0..63522267d61d 100644 --- a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp @@ -1390,82 +1390,6 @@ TEST_F(ParquetTableScanTest, singleBooleanRle) { assertSelect({"c2"}, "SELECT c2 FROM tmp"); } -TEST_F(ParquetTableScanTest, intToBigintRead) { - vector_size_t kSize = 100; - RowVectorPtr intDataFileVectors = makeRowVector( - {"c1"}, {makeFlatVector(kSize, [](auto row) { return row; })}); - - RowVectorPtr bigintDataFileVectors = makeRowVector( - {"c1"}, {makeFlatVector(kSize, [](auto row) { return row; })}); - - const std::shared_ptr dataFileFolder = - exec::test::TempDirectoryPath::create(); - auto filePath = dataFileFolder->getPath() + "/" + "data.parquet"; - WriterOptions options; - options.writeInt96AsTimestamp = false; - writeToParquetFile(filePath, {intDataFileVectors}, options); - - auto rowType = ROW({"c1"}, {BIGINT()}); - auto op = PlanBuilder() - .startTableScan() - .outputType(rowType) - .dataColumns(rowType) - .endTableScan() - .planNode(); - - auto split = makeSplit(filePath); - auto result = AssertQueryBuilder(op).split(split).copyResults(pool()); - auto rows = result->as(); - - assertEqualVectors(bigintDataFileVectors->childAt(0), rows->childAt(0)); -} - -TEST_F(ParquetTableScanTest, shortAndLongDecimalReadWithLargerPrecision) { - // decimal.parquet holds two columns (a: DECIMAL(5, 2), b: DECIMAL(20, 5)) and - // 20 rows (10 rows per group). Data is in plain uncompressed format: - // a: [100.01 .. 
100.20] - // b: [100000000000000.00001 .. 100000000000000.00020] - // This test reads the DECIMAL(5, 2)a and DECIMAL(20, 5) file columns - // with DECIMAL(8, 2) and DECIMAL(22, 5) row types. - vector_size_t kSize = 20; - std::vector unscaledShortValues(kSize); - std::iota(unscaledShortValues.begin(), unscaledShortValues.end(), 10001); - std::vector longDecimalValues; - for (int i = 1; i <= kSize; ++i) { - if (i < 10) { - longDecimalValues.emplace_back( - HugeInt::parse(fmt::format("1000000000000000000{}", i))); - } else { - longDecimalValues.emplace_back( - HugeInt::parse(fmt::format("100000000000000000{}", i))); - } - } - - RowVectorPtr expectedDecimalVectors = makeRowVector( - {"c1", "c2"}, - {makeFlatVector(unscaledShortValues, DECIMAL(8, 2)), - makeFlatVector(longDecimalValues, DECIMAL(22, 5))}); - - const std::shared_ptr dataFileFolder = - exec::test::TempDirectoryPath::create(); - auto filePath = getExampleFilePath("decimal.parquet"); - - auto rowType = ROW({"c1", "c2"}, {DECIMAL(8, 2), DECIMAL(22, 5)}); - auto op = PlanBuilder() - .startTableScan() - .outputType(rowType) - .dataColumns(rowType) - .endTableScan() - .planNode(); - - auto split = makeSplit(filePath); - auto result = AssertQueryBuilder(op).split(split).copyResults(pool()); - auto rows = result->as(); - - assertEqualVectors(expectedDecimalVectors->childAt(0), rows->childAt(0)); - assertEqualVectors(expectedDecimalVectors->childAt(1), rows->childAt(1)); -} - int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); folly::Init init{&argc, &argv, false}; From c5c59b11969762d93ce567b0fab67d9689dfafb3 Mon Sep 17 00:00:00 2001 From: yingsu00 Date: Wed, 28 May 2025 21:05:28 +0800 Subject: [PATCH 07/10] Add diagrams --- velox/connectors/BeforeConnectorRefactor.puml | 151 +++++++++++ velox/connectors/after_connector_plugin.puml | 243 ++++++++++++++++++ .../before_connector_registery.puml | 186 ++++++++++++++ 3 files changed, 580 insertions(+) create mode 100644 velox/connectors/BeforeConnectorRefactor.puml create mode 100644 velox/connectors/after_connector_plugin.puml create mode 100644 velox/connectors/before_connector_registery.puml diff --git a/velox/connectors/BeforeConnectorRefactor.puml b/velox/connectors/BeforeConnectorRefactor.puml new file mode 100644 index 000000000000..e89babb1a73a --- /dev/null +++ b/velox/connectors/BeforeConnectorRefactor.puml @@ -0,0 +1,151 @@ +@startuml BeforeConnectorDesign +'--- Connector interfaces and coupling in the current monolithic design +package velox { + package connector { + + + interface ConnectorFactory { + +newConnector(...) <> + } + + interface Connector { + +createDataSource(...) : unique_ptr <> +' +createIndexSource(...) : shared_ptr <> + +createDataSink(...) : unique_ptr <> + +connectorConfig() : const shared_ptr& <> + +canAddDynamicFilter() : bool + } + + interface ConnectorSplit + interface ColumnHandle + interface ConnectorTableHandle + + interface DataSource { + +addSplit(std::shared_ptr split) : void <> + +next(std::shared_ptr split) : std::optional <> + } + + interface DataSink { + +appendData(RowVectorPtr input) : void <> + +finish() : bool <> + +close() : bool <> + +abort() : void <> + +stats() : Stats <> + } + + + + package hive { + class HiveConnector { + +createDataSource(...) : unique_ptr <> + +createDataSink(...) : unique_ptr <> + const std::shared_ptr hiveConfig_ + FileHandleFactory fileHandleFactory_ + std::shared_ptr metadata_ + } + Connector <|.. HiveConnector + + class HiveConnectorFactory + ConnectorFactory <|.. 
HiveConnectorFactory + + class HiveConnectorSplit + ConnectorSplit <|.. HiveConnectorSplit + + class HiveDataSource { + std::shared_ptr split_; + std::shared_ptr hiveTableHandle_; + std::shared_ptr scanSpec_; + std::unique_ptr splitReader_; + } + DataSource <|- HiveDataSource + + class HiveDataSink + DataSink <|- HiveDataSink + + class SplitReader { + +create(...) : unique_ptr <> + +prepareSplit() : void <> + +next(...) : uint64_t <> + } + + package iceberg { + class IcebergSplitReader + SplitReader <|- IcebergSplitReader + + class HiveIcebergSplit + HiveConnectorSplit <|- HiveIcebergSplit + + } + } + + package tpch { + class TpchConnector { + +createDataSource(...) : unique_ptr <> + +createDataSink(...) : unique_ptr <> + } + Connector <|.. TpchConnector + + class TpchConnectorFactory + ConnectorFactory <|.. TpchConnectorFactory + + } + + package fuzzer { + class FuzzerConnector { + +createDataSource(...) : unique_ptr <> + +createDataSink(...) : unique_ptr <> + } + Connector <|.. FuzzerConnector + + class FuzzerConnectorFactory + ConnectorFactory <|.. FuzzerConnectorFactory + + } + + + class ConnectorRegistry <> { + +registerConnectorFactory(name, factory) + +getConnectorFactory(name) : ConnectorFactory + } + + class GlobalFunctions { + +registerConnectorFactory(shared_ptr) : bool + +unregisterConnectorFactory(const string& connectorName) : bool + +registerConnector(shared_ptr connector) : bool + +unregisterConnector(shared_ptr connector) : bool + +getConnector(const std::string& connectorId) : shared_ptr + } + + ConnectorRegistry --> ConnectorFactory : holds registered factories + + } + + + package exec { + class HiveConnectorTestBase { + +serialize(): folly::dynamic <> + } + HiveConnectorTestBase --> HiveConnectorFactory : makeXXX(TableHandle, ColumnHandle, etc) + + + class ExecCode + ExecCode --> ConnectorRegistry : getConnectorFactory("hive") + ExecCode --> HiveConnectorFactory : newConnector(...) 
+ } + +} + +package presto { + class PrestoServer { + + registerVeloxConnectors(const fs::path&) : vector <> + + registerSystemConnector() <> + + unregisterConnectors() <> + } + + class SystemConnector { + } + +} +Connector <|- SystemConnector + +@enduml diff --git a/velox/connectors/after_connector_plugin.puml b/velox/connectors/after_connector_plugin.puml new file mode 100644 index 000000000000..908a961e160b --- /dev/null +++ b/velox/connectors/after_connector_plugin.puml @@ -0,0 +1,243 @@ +@startuml BeforeConnectorDesign +'--- Connector interfaces and coupling in the current monolithic design +package velox { + package connector { + '--- New plugin‐based design with common factory/registry + + + + package common { + class connectorObjectFactories <> { + static factories + } + connectorObjectFactories --> ConnectorObjectFactory : holds registered factories + connectorObjectFactories -[hidden]down-> ConnectorObjectFactory + + class connectors <> { + static connectors + } + connectors --> Connector : holds registered connectors + connectors -[hidden]down-> ConnectorFactory + + class connectorFactories <> { + static factories + } + connectorFactories --> ConnectorFactory : holds registered factories + connectorFactories -[hidden]down-> ConnectorObjectFactory +' connectorFactories -[hidden]down-> Connector + connectorFactories -[hidden]down-> ConnectorSplit + connectorFactories -[hidden]down-> ConnectorColumnHandle + connectorFactories -[hidden]down-> ConnectorTableHandle + connectorFactories -[hidden]down-> ConnectorInsertTableHandle + connectorFactories -[hidden]down-> DataSource + connectorFactories -[hidden]down-> DataSink + + class ConnectorRegistryFunctions <> { + +registerConnectorFactory(...) : bool + +unregisterConnectorFactory(...) : bool + +getConnectorFactory() + +registerConnector(...) : bool + +unregisterConnector(...) : bool + +getConnector(...) : shared_ptr + } + + + + interface ConnectorFactory { + +newConnector(...) <> + } + ConnectorFactory --> Connector : newConnector + ConnectorFactory -[hidden]down-> Connector + ConnectorFactory -[hidden]down-> ConnectorSplit + ConnectorFactory -[hidden]down-> ConnectorColumnHandle +' ConnectorFactory -[hidden]down-> ConnectorTableHandle + ConnectorFactory -[hidden]down-> DataSource + ConnectorFactory -[hidden]down-> DataSink + + + interface ConnectorObjectFactory { + +makeConnectorSplit(...) + +makeTableHandle(...) + +makeInsertTableHandle(...) + +makeColumnHandle(...) + +makeLocationHandle(...) 
+ } + ConnectorObjectFactory --> ConnectorSplit : makeConnectorSplit + ConnectorObjectFactory --> ConnectorTableHandle : makeTableHandle + ConnectorObjectFactory --> ConnectorInsertTableHandle : makeInsertTableHandle + ConnectorObjectFactory --> ConnectorColumnHandle : makeColumnHandle + ConnectorObjectFactory --> ConnectorLocationHandle : makeLocationHandle + ConnectorObjectFactory -[hidden]down-> ConnectorSplit + ConnectorObjectFactory -[hidden]down-> ConnectorColumnHandle + ConnectorObjectFactory -[hidden]down-> ConnectorTableHandle + ConnectorObjectFactory -[hidden]down-> ConnectorLocationHandle + + + + interface ConnectorSplit + interface ConnectorColumnHandle + interface ConnectorLocationHandle + interface ConnectorTableHandle + interface ConnectorInsertTableHandle + interface ConnectorLocationHandle + interface Connector + interface DataSource + interface DataSink + Connector --> DataSource : createDataSource + Connector --> DataSink : createDataSink + + + + } + + package readers { + class SplitReader + } + + package hive { + class HiveConnector + Connector <|.. HiveConnector + + class HiveConnectorFactory + ConnectorFactory <|.. HiveConnectorFactory + + class HiveConnectorSplit + ConnectorSplit <|.. HiveConnectorSplit + + class HiveColumnHandle + ConnectorColumnHandle <|.. HiveColumnHandle + + class HiveTableHandle + ConnectorTableHandle <|.. HiveTableHandle + + class HiveInsertTableHandle + ConnectorInsertTableHandle <|.. HiveInsertTableHandle + + class HiveDataSource + DataSource <|.. HiveDataSource + + class HiveDataSink + DataSink <|.. HiveDataSink + + class HiveSplitReader + SplitReader <|- HiveSplitReader + + HiveTableHandle -[hidden]right-> HiveSplitReader + HiveConnector -[hidden]right-> HiveSplitReader + } + + ConnectorSplit -[hidden]down-> hive +' +' package iceberg { +' class IcebergConnector +' Connector <|.. HiveConnector +' +' class IcebergConnectorFactory +' ConnectorFactory <|.. IcebergConnectorFactory +' +' class IcebergConnectorSplit +' ConnectorSplit <|.. IcebergConnectorSplit +' +' class IcebergColumnHandle +' ConnectorColumnHandle <|.. IcebergColumnHandle +' +' class IcebergTableHandle +' ConnectorTableHandle <|.. IcebergTableHandle +' +' class IcebergInsertTableHandle +' ConnectorInsertTableHandle <|.. IcebergInsertTableHandle +' +' class IcebergDataSource +' DataSource <|.. IcebergDataSource +' +' class IcebergDataSink +' DataSink <|.. IcebergDataSink +' +' class IcebergSplitReader +' SplitReader <|- IcebergSplitReader +' } +' SplitReader -[hidden]down-> iceberg +' HiveConnectorSplit -[hidden]down-> iceberg + + + package tpch { + class TpchConnector + Connector <|.. TpchConnector + + class TpchConnectorFactory + ConnectorFactory <|.. TpchConnectorFactory + + class TpchTableHandle + ConnectorTableHandle <|.. TpchTableHandle + + } + + package fuzzer { + class FuzzerConnector + Connector <|.. FuzzerConnector + + class FuzzerConnectorFactory + ConnectorFactory <|.. 
FuzzerConnectorFactory + + } + } + + + package exec { + class HiveConnectorTestBase { + +makeConnectorSplits() + +makeConnectorSplit() + +makeTableHandle() + +makeInsertTableHandle() + +makeColumnHandle() + +makeLocationHandle() + ConnectorObjectFactory* objectFactory_ + } + HiveConnectorTestBase --> connectorFactories : getConnectorFactory("hive") + HiveConnectorTestBase --> connectorFactories : unregisterConnectorFactory("hive") + HiveConnectorTestBase --> connectorObjectFactories : getConnectorObjectFactory("hive") +' HiveConnectorTestBase --> HiveConnectorSplit : makeHiveConnectorSplit() +' HiveConnectorTestBase --> HiveColumnHandle : makeColumnHandle() +' HiveConnectorTestBase --> HiveTableHandle : makeTableHandle() +' HiveConnectorTestBase --> HiveInsertTableHandle : makeInsertTableHandle() + + class XXXTest + XXXTest --> connectorFactories : getConnectorFactory("hive") + XXXTest --> connectorObjectFactories : getConnectorObjectFactory("hive") + XXXTest --> ConnectorFactory : newConnector(Hive) + XXXTest --> ConnectorFactory : newConnector(Hive) + + HiveConnectorTestBase <|-XXXTest + HiveConnectorTestBase -[hidden]down-> XXXTest + + class TableScan + TableScan --> connectors : getConnector + + class TableWriter + TableWriter --> connectors : getConnector + + TableScan -[hidden]down-> XXXTest +' TableWriter -[hidden]down-> XXXTest + } + + exec -[hidden]down-> hive + +} + + +package presto { + class PrestoServer { + + registerVeloxConnectors(const fs::path&) : vector <> + + registerSystemConnector() <> + + unregisterConnectors() <> + } + + class SystemConnector { + } + +} +Connector <|.. SystemConnector +PrestoServer --> ConnectorRegistryFunctions : getConnectorFactory(connectorName) +PrestoServer --> ConnectorRegistryFunctions : registerConnector + +@enduml diff --git a/velox/connectors/before_connector_registery.puml b/velox/connectors/before_connector_registery.puml new file mode 100644 index 000000000000..3f3c5769d89f --- /dev/null +++ b/velox/connectors/before_connector_registery.puml @@ -0,0 +1,186 @@ +@startuml BeforeConnectorDesign +'--- Connector interfaces and coupling in the current monolithic design +package velox { + package connector { + class connectorFactories <> { + static factories + } + connectorFactories --> ConnectorFactory : holds registered factories + + class connectors <> { + static connectors + } + connectors --> Connector : holds registered connectors + + class ConnectorRegistryFunctions <> { + +registerConnectorFactory(...) : bool + +unregisterConnectorFactory(...) : bool + +getConnectorFactory() + +registerConnector(...) : bool + +unregisterConnector(...) : bool + +getConnector(...) : shared_ptr + } + + interface ConnectorFactory { + +newConnector(...) <> + } + ConnectorFactory --> Connector : newConnector + + interface ConnectorSplit + interface ColumnHandle + interface ConnectorTableHandle + interface ConnectorInsertTableHandle + interface Connector + interface DataSource + interface DataSink + Connector --> DataSource : createDataSource() + Connector --> DataSink : createDataSink() + + + + + + package hive { + class HiveConnector + Connector <|.. HiveConnector + + class HiveConnectorFactory + ConnectorFactory <|.. HiveConnectorFactory + + class HiveConnectorSplit + ConnectorSplit <|.. HiveConnectorSplit + + class HiveColumnHandle + ColumnHandle <|.. HiveColumnHandle + + class HiveTableHandle + ConnectorTableHandle <|.. HiveTableHandle + + class HiveInsertTableHandle + ConnectorInsertTableHandle <|.. 
HiveInsertTableHandle + + class HiveDataSource + DataSource <|.. HiveDataSource + + class HiveDataSink{ + class HiveLocationHandle + } + DataSink <|.. HiveDataSink + + class SplitReader + + package iceberg { + class IcebergSplitReader + SplitReader <|- IcebergSplitReader + + class HiveIcebergSplit + HiveConnectorSplit <|- HiveIcebergSplit + + class IcebergInsertTableHandle + HiveInsertTableHandle <|- IcebergInsertTableHandle + } + + SplitReader -[hidden]down-> iceberg + HiveConnectorSplit -[hidden]down-> iceberg + HiveInsertTableHandle -[hidden]down-> iceberg + } + + + + + package tpch { + class TpchConnector + Connector <|.. TpchConnector + + class TpchConnectorFactory + ConnectorFactory <|.. TpchConnectorFactory + + class TpchTableHandle + ConnectorTableHandle <|.. TpchTableHandle + + } + + package fuzzer { + class FuzzerConnector + Connector <|.. FuzzerConnector + + class FuzzerConnectorFactory + ConnectorFactory <|.. FuzzerConnectorFactory + + } + + ConnectorSplit -[hidden]down-> hive + ColumnHandle -[hidden]down-> hive + ConnectorTableHandle -[hidden]down-> hive + ConnectorInsertTableHandle -[hidden]down-> hive + + ConnectorSplit -[hidden]down-> tpch + ColumnHandle -[hidden]down-> tpch + ConnectorTableHandle -[hidden]down-> tpch + ConnectorInsertTableHandle -[hidden]down-> tpch + + ConnectorSplit -[hidden]down-> fuzzer +' ColumnHandle -[hidden]down-> hive +' ConnectorTableHandle -[hidden]down-> hive + + } + + + package exec { + class HiveConnectorTestBase { + +makeHiveConnectorSplits() + +makeHiveConnectorSplit() + +makeTableHandle() + +makeHiveInsertTableHandle() + +makeColumnHandle() + +makeLocationHandle() + ConnectorObjectFactory* objectFactory_ + } + HiveConnectorTestBase --> connectorFactories : getConnectorFactory("hive") + HiveConnectorTestBase --> connectorFactories : unregisterConnectorFactory("hive") + HiveConnectorTestBase --> HiveConnectorSplit : makeHiveConnectorSplit() + HiveConnectorTestBase --> HiveColumnHandle : makeColumnHandle() + HiveConnectorTestBase --> HiveTableHandle : makeTableHandle() + HiveConnectorTestBase --> HiveInsertTableHandle : makeInsertTableHandle() + HiveConnectorTestBase --> HiveDataSink : makeLocationHandle() + + class XXXTest + XXXTest --> connectorFactories : getConnectorFactory("hive") + XXXTest --> ConnectorFactory : newConnector(Hive) +' + + HiveConnectorTestBase <|-XXXTest + HiveConnectorTestBase -[hidden]down-> XXXTest + + class TableScan + TableScan --> connectors : getConnector + + class TableWriter + TableWriter --> connectors : getConnector + + TableScan -[hidden]down-> XXXTest +' TableWriter -[hidden]down-> XXXTest + } + + exec -[hidden]down-> hive + +} +exec -[hidden]down-> connector + + +package presto { + class PrestoServer { + + registerVeloxConnectors(const fs::path&) : vector <> + + registerSystemConnector() <> + + unregisterConnectors() <> + } + + class SystemConnector { + } + +} +Connector <|.. SystemConnector +PrestoServer --> ConnectorRegistryFunctions : getConnectorFactory(connectorName) +PrestoServer --> ConnectorRegistryFunctions : registerConnector + +@enduml From 3826f378a786bab0c412a879093d6e5dcc366e3f Mon Sep 17 00:00:00 2001 From: yingsu00 Date: Thu, 22 May 2025 17:02:47 +0800 Subject: [PATCH 08/10] Move dwio related functions from HiveConnectorTestBase to OperatorTestBase To drop dependency of the exec tests on Hive connector, we want to move the dwio related functions from HiveConnectorTestBase to OperatorTestBase. The rest of HiveConnectorTestBase is dependent on Hive. 
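
With the helpers relocated, an exec-level test can produce DWRF test files directly through OperatorTestBase, with no HiveConnectorTestBase in its inheritance chain. A minimal sketch of the intended usage follows; the fixture name and the data are illustrative only, and it assumes the usual gtest/test-utility includes. Only writeToFile(), makeVectors() and TempFilePath are the real helpers touched by this patch:

    // Hypothetical fixture: derives from OperatorTestBase, not
    // HiveConnectorTestBase, so no Hive connector is registered.
    class ExecOnlyTest : public OperatorTestBase {};

    TEST_F(ExecOnlyTest, writesDwrfWithoutHiveConnector) {
      auto rowType = ROW({"c0", "c1"}, {BIGINT(), DOUBLE()});
      // makeVectors() and writeToFile() now live on OperatorTestBase.
      auto vectors = makeVectors(rowType, /*numVectors=*/2, /*rowsPerVector=*/100);
      auto file = TempFilePath::create();
      writeToFile(file->getPath(), vectors);
    }

Tests that still need splits, table handles or column handles (for example LimitTest) keep deriving from HiveConnectorTestBase as before.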
--- velox/exec/tests/LimitTest.cpp | 1 + .../tests/utils/HiveConnectorTestBase.cpp | 107 ---------------- .../exec/tests/utils/HiveConnectorTestBase.h | 43 ------- velox/exec/tests/utils/OperatorTestBase.cpp | 121 ++++++++++++++++++ velox/exec/tests/utils/OperatorTestBase.h | 46 +++++++ .../tests/utils/AggregationTestBase.cpp | 15 --- 6 files changed, 168 insertions(+), 165 deletions(-) diff --git a/velox/exec/tests/LimitTest.cpp b/velox/exec/tests/LimitTest.cpp index d1be2199c006..3ef1242109d1 100644 --- a/velox/exec/tests/LimitTest.cpp +++ b/velox/exec/tests/LimitTest.cpp @@ -21,6 +21,7 @@ using namespace facebook::velox; using namespace facebook::velox::exec; using namespace facebook::velox::exec::test; +// requres writeToFile, makeHiveConnectorSplit class LimitTest : public HiveConnectorTestBase {}; TEST_F(LimitTest, basic) { diff --git a/velox/exec/tests/utils/HiveConnectorTestBase.cpp b/velox/exec/tests/utils/HiveConnectorTestBase.cpp index 94aea88367c2..42b06df0fe32 100644 --- a/velox/exec/tests/utils/HiveConnectorTestBase.cpp +++ b/velox/exec/tests/utils/HiveConnectorTestBase.cpp @@ -16,23 +16,10 @@ #include "velox/exec/tests/utils/HiveConnectorTestBase.h" -#include "velox/common/file/FileSystems.h" -#include "velox/common/file/tests/FaultyFileSystem.h" -#include "velox/dwio/common/tests/utils/BatchMaker.h" -#include "velox/dwio/dwrf/RegisterDwrfReader.h" -#include "velox/dwio/dwrf/RegisterDwrfWriter.h" -#include "velox/dwio/dwrf/reader/DwrfReader.h" -#include "velox/dwio/dwrf/writer/FlushPolicy.h" -#include "velox/dwio/dwrf/writer/Writer.h" #include "velox/exec/tests/utils/AssertQueryBuilder.h" namespace facebook::velox::exec::test { -HiveConnectorTestBase::HiveConnectorTestBase() { - filesystems::registerLocalFileSystem(); - tests::utils::registerFaultyFileSystem(); -} - void HiveConnectorTestBase::SetUp() { OperatorTestBase::SetUp(); connector::registerConnectorFactory( @@ -46,17 +33,12 @@ void HiveConnectorTestBase::SetUp() { std::unordered_map()), ioExecutor_.get()); connector::registerConnector(hiveConnector); - dwio::common::registerFileSinks(); - dwrf::registerDwrfReaderFactory(); - dwrf::registerDwrfWriterFactory(); } void HiveConnectorTestBase::TearDown() { // Make sure all pending loads are finished or cancelled before unregister // connector. 
ioExecutor_.reset(); - dwrf::unregisterDwrfReaderFactory(); - dwrf::unregisterDwrfWriterFactory(); connector::unregisterConnector(kHiveConnectorId); connector::unregisterConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName); @@ -73,95 +55,6 @@ void HiveConnectorTestBase::resetHiveConnector( connector::registerConnector(hiveConnector); } -void HiveConnectorTestBase::writeToFiles( - const std::vector& filePaths, - std::vector vectors) { - VELOX_CHECK_EQ(filePaths.size(), vectors.size()); - for (int i = 0; i < filePaths.size(); ++i) { - writeToFile(filePaths[i], std::vector{vectors[i]}); - } -} - -void HiveConnectorTestBase::writeToFile( - const std::string& filePath, - RowVectorPtr vector) { - writeToFile(filePath, std::vector{vector}); -} - -void HiveConnectorTestBase::writeToFile( - const std::string& filePath, - const std::vector& vectors, - std::shared_ptr config, - const std::function()>& - flushPolicyFactory) { - writeToFile( - filePath, - vectors, - std::move(config), - vectors[0]->type(), - flushPolicyFactory); -} - -void HiveConnectorTestBase::writeToFile( - const std::string& filePath, - const std::vector& vectors, - std::shared_ptr config, - const TypePtr& schema, - const std::function()>& - flushPolicyFactory) { - velox::dwrf::WriterOptions options; - options.config = config; - options.schema = schema; - auto fs = filesystems::getFileSystem(filePath, {}); - auto writeFile = fs->openFileForWrite( - filePath, - {.shouldCreateParentDirectories = true, - .shouldThrowOnFileAlreadyExists = false}); - auto sink = std::make_unique( - std::move(writeFile), filePath); - auto childPool = rootPool_->addAggregateChild("HiveConnectorTestBase.Writer"); - options.memoryPool = childPool.get(); - options.flushPolicyFactory = flushPolicyFactory; - - facebook::velox::dwrf::Writer writer{std::move(sink), options}; - for (size_t i = 0; i < vectors.size(); ++i) { - writer.write(vectors[i]); - } - writer.close(); -} - -void HiveConnectorTestBase::createDirectory(const std::string& directoryPath) { - auto fs = filesystems::getFileSystem(directoryPath, {}); - fs->mkdir(directoryPath); -} - -void HiveConnectorTestBase::removeDirectory(const std::string& directoryPath) { - auto fs = filesystems::getFileSystem(directoryPath, {}); - if (fs->exists(directoryPath)) { - fs->rmdir(directoryPath); - } -} - -void HiveConnectorTestBase::removeFile(const std::string& filePath) { - auto fs = filesystems::getFileSystem(filePath, {}); - if (fs->exists(filePath)) { - fs->remove(filePath); - } -} - -std::vector HiveConnectorTestBase::makeVectors( - const RowTypePtr& rowType, - int32_t numVectors, - int32_t rowsPerVector) { - std::vector vectors; - for (int32_t i = 0; i < numVectors; ++i) { - auto vector = std::dynamic_pointer_cast( - velox::test::BatchMaker::createBatch(rowType, rowsPerVector, *pool_)); - vectors.push_back(vector); - } - return vectors; -} - std::shared_ptr HiveConnectorTestBase::assertQuery( const core::PlanNodePtr& plan, const std::vector>& filePaths, diff --git a/velox/exec/tests/utils/HiveConnectorTestBase.h b/velox/exec/tests/utils/HiveConnectorTestBase.h index 2173acf133e7..c369af95defa 100644 --- a/velox/exec/tests/utils/HiveConnectorTestBase.h +++ b/velox/exec/tests/utils/HiveConnectorTestBase.h @@ -19,8 +19,6 @@ #include "velox/connectors/hive/HiveConnectorSplit.h" #include "velox/connectors/hive/HiveDataSink.h" #include "velox/connectors/hive/TableHandle.h" -#include "velox/dwio/dwrf/common/Config.h" -#include "velox/dwio/dwrf/writer/FlushPolicy.h" #include 
"velox/exec/Operator.h" #include "velox/exec/tests/utils/OperatorTestBase.h" #include "velox/exec/tests/utils/TempFilePath.h" @@ -35,53 +33,12 @@ using ColumnHandleMap = class HiveConnectorTestBase : public OperatorTestBase { public: - HiveConnectorTestBase(); - void SetUp() override; void TearDown() override; void resetHiveConnector( const std::shared_ptr& config); - void writeToFiles( - const std::vector& filePaths, - std::vector vectors); - - void writeToFile(const std::string& filePath, RowVectorPtr vector); - - void writeToFile( - const std::string& filePath, - const std::vector& vectors, - std::shared_ptr config = - std::make_shared(), - const std::function()>& - flushPolicyFactory = nullptr); - - void writeToFile( - const std::string& filePath, - const std::vector& vectors, - std::shared_ptr config, - const TypePtr& schema, - const std::function()>& - flushPolicyFactory = nullptr); - - // Creates a directory using matching file system based on directoryPath. - // No throw when directory already exists. - void createDirectory(const std::string& directoryPath); - - // Removes a directory using matching file system based on directoryPath. - // No op when directory does not exist. - void removeDirectory(const std::string& directoryPath); - - // Removes a file using matching file system based on filePath. - // No op when file does not exist. - void removeFile(const std::string& filePath); - - std::vector makeVectors( - const RowTypePtr& rowType, - int32_t numVectors, - int32_t rowsPerVector); - using OperatorTestBase::assertQuery; /// Assumes plan has a single TableScan node. diff --git a/velox/exec/tests/utils/OperatorTestBase.cpp b/velox/exec/tests/utils/OperatorTestBase.cpp index 297010acb4cc..7a905c23b3e9 100644 --- a/velox/exec/tests/utils/OperatorTestBase.cpp +++ b/velox/exec/tests/utils/OperatorTestBase.cpp @@ -18,9 +18,16 @@ #include "velox/common/base/PeriodicStatsReporter.h" #include "velox/common/caching/AsyncDataCache.h" #include "velox/common/file/FileSystems.h" +#include "velox/common/file/tests/FaultyFileSystem.h" #include "velox/common/memory/MallocAllocator.h" #include "velox/common/memory/SharedArbitrator.h" #include "velox/common/testutil/TestValue.h" +#include "velox/dwio/common/tests/utils/BatchMaker.h" +#include "velox/dwio/dwrf/RegisterDwrfReader.h" +#include "velox/dwio/dwrf/RegisterDwrfWriter.h" +#include "velox/dwio/dwrf/reader/DwrfReader.h" +#include "velox/dwio/dwrf/writer/FlushPolicy.h" +#include "velox/dwio/dwrf/writer/Writer.h" #include "velox/exec/tests/utils/LocalExchangeSource.h" #include "velox/functions/prestosql/aggregates/RegisterAggregateFunctions.h" #include "velox/functions/prestosql/registration/RegistrationFunctions.h" @@ -49,6 +56,9 @@ OperatorTestBase::OperatorTestBase() { vectorMaker_ = velox::test::VectorMaker(pool_.get()); parse::registerTypeResolver(); + + filesystems::registerLocalFileSystem(); + tests::utils::registerFaultyFileSystem(); } void OperatorTestBase::registerVectorSerde() { @@ -149,9 +159,16 @@ void OperatorTestBase::SetUp() { options.spillStatsIntervalMs = 2'000; startPeriodicStatsReporter(options); testingStartLocalExchangeSource(); + + dwio::common::registerFileSinks(); + dwrf::registerDwrfReaderFactory(); + dwrf::registerDwrfWriterFactory(); } void OperatorTestBase::TearDown() { + dwrf::unregisterDwrfReaderFactory(); + dwrf::unregisterDwrfWriterFactory(); + waitForAllTasksToBeDeleted(); stopPeriodicStatsReporter(); // There might be lingering exchange source on executor even after all tasks @@ -266,4 +283,108 @@ 
core::TypedExprPtr OperatorTestBase::parseExpr( EXPECT_FALSE(fs->exists(spillDirectoryStr)); } +void OperatorTestBase::writeToFiles( + const std::vector& filePaths, + std::vector vectors) { + VELOX_CHECK_EQ(filePaths.size(), vectors.size()); + for (int i = 0; i < filePaths.size(); ++i) { + writeToFile(filePaths[i], std::vector{vectors[i]}); + } +} + +void OperatorTestBase::writeToFile( + const std::string& filePath, + RowVectorPtr vector) { + writeToFile(filePath, std::vector{vector}); +} + +void OperatorTestBase::writeToFile( + const std::string& filePath, + const std::vector& vectors, + std::shared_ptr config, + const std::function()>& + flushPolicyFactory) { + writeToFile( + filePath, + vectors, + std::move(config), + vectors[0]->type(), + flushPolicyFactory); +} + +void OperatorTestBase::writeToFile( + const std::string& filePath, + const std::vector& vectors, + std::shared_ptr config, + const TypePtr& schema, + const std::function()>& + flushPolicyFactory) { + velox::dwrf::WriterOptions options; + options.config = config; + options.schema = schema; + auto fs = filesystems::getFileSystem(filePath, {}); + auto writeFile = fs->openFileForWrite( + filePath, + {.shouldCreateParentDirectories = true, + .shouldThrowOnFileAlreadyExists = false}); + auto sink = std::make_unique( + std::move(writeFile), filePath); + auto childPool = rootPool_->addAggregateChild("OperatorTestBase.Writer"); + options.memoryPool = childPool.get(); + options.flushPolicyFactory = flushPolicyFactory; + + facebook::velox::dwrf::Writer writer{std::move(sink), options}; + for (size_t i = 0; i < vectors.size(); ++i) { + writer.write(vectors[i]); + } + writer.close(); +} + +void OperatorTestBase::writeToFile( + const std::string& path, + const VectorPtr& vector, + memory::MemoryPool* pool) { + dwrf::WriterOptions options; + options.schema = vector->type(); + options.memoryPool = pool; + auto writeFile = std::make_unique(path, true, false); + auto sink = + std::make_unique(std::move(writeFile), path); + dwrf::Writer writer(std::move(sink), options); + writer.write(vector); + writer.close(); +} + +void OperatorTestBase::createDirectory(const std::string& directoryPath) { + auto fs = filesystems::getFileSystem(directoryPath, {}); + fs->mkdir(directoryPath); +} + +void OperatorTestBase::removeDirectory(const std::string& directoryPath) { + auto fs = filesystems::getFileSystem(directoryPath, {}); + if (fs->exists(directoryPath)) { + fs->rmdir(directoryPath); + } +} + +void OperatorTestBase::removeFile(const std::string& filePath) { + auto fs = filesystems::getFileSystem(filePath, {}); + if (fs->exists(filePath)) { + fs->remove(filePath); + } +} + +std::vector OperatorTestBase::makeVectors( + const RowTypePtr& rowType, + int32_t numVectors, + int32_t rowsPerVector) { + std::vector vectors; + for (int32_t i = 0; i < numVectors; ++i) { + auto vector = std::dynamic_pointer_cast( + velox::test::BatchMaker::createBatch(rowType, rowsPerVector, *pool_)); + vectors.push_back(vector); + } + return vectors; +} + } // namespace facebook::velox::exec::test diff --git a/velox/exec/tests/utils/OperatorTestBase.h b/velox/exec/tests/utils/OperatorTestBase.h index 639e387bab76..101e77807e6f 100644 --- a/velox/exec/tests/utils/OperatorTestBase.h +++ b/velox/exec/tests/utils/OperatorTestBase.h @@ -21,6 +21,8 @@ #include "velox/common/caching/SsdCache.h" #include "velox/core/Expressions.h" #include "velox/core/PlanNode.h" +#include "velox/dwio/dwrf/common/Config.h" +#include "velox/dwio/dwrf/writer/FlushPolicy.h" #include "velox/exec/HashProbe.h" 
#include "velox/exec/tests/utils/QueryAssertions.h" #include "velox/parse/ExpressionsParser.h" @@ -166,6 +168,50 @@ class OperatorTestBase : public virtual testing::Test, RowTypePtr rowType, const parse::ParseOptions& options = {}); + void writeToFiles( + const std::vector& filePaths, + std::vector vectors); + + void writeToFile(const std::string& filePath, RowVectorPtr vector); + + void writeToFile( + const std::string& filePath, + const std::vector& vectors, + std::shared_ptr config = + std::make_shared(), + const std::function()>& + flushPolicyFactory = nullptr); + + void writeToFile( + const std::string& filePath, + const std::vector& vectors, + std::shared_ptr config, + const TypePtr& schema, + const std::function()>& + flushPolicyFactory = nullptr); + + void writeToFile( + const std::string& path, + const VectorPtr& vector, + memory::MemoryPool* pool); + + // Creates a directory using matching file system based on directoryPath. + // No throw when directory already exists. + void createDirectory(const std::string& directoryPath); + + // Removes a directory using matching file system based on directoryPath. + // No op when directory does not exist. + void removeDirectory(const std::string& directoryPath); + + // Removes a file using matching file system based on filePath. + // No op when file does not exist. + void removeFile(const std::string& filePath); + + std::vector makeVectors( + const RowTypePtr& rowType, + int32_t numVectors, + int32_t rowsPerVector); + public: static void deleteTaskAndCheckSpillDirectory(std::shared_ptr& task); diff --git a/velox/functions/lib/aggregates/tests/utils/AggregationTestBase.cpp b/velox/functions/lib/aggregates/tests/utils/AggregationTestBase.cpp index e96b14c30b6e..de7114acc35e 100644 --- a/velox/functions/lib/aggregates/tests/utils/AggregationTestBase.cpp +++ b/velox/functions/lib/aggregates/tests/utils/AggregationTestBase.cpp @@ -608,21 +608,6 @@ void AggregationTestBase::testAggregationsWithCompanion( namespace { -void writeToFile( - const std::string& path, - const VectorPtr& vector, - memory::MemoryPool* pool) { - dwrf::WriterOptions options; - options.schema = vector->type(); - options.memoryPool = pool; - auto writeFile = std::make_unique(path, true, false); - auto sink = - std::make_unique(std::move(writeFile), path); - dwrf::Writer writer(std::move(sink), options); - writer.write(vector); - writer.close(); -} - template class ScopedChange { public: From 2b7056e9a2c8bf345d9da05194ae1b23330370e2 Mon Sep 17 00:00:00 2001 From: yingsu00 Date: Sun, 25 May 2025 17:39:21 +0800 Subject: [PATCH 09/10] Rename ColumnHandle to ConnectorColumnHandle To match with ConnectorTableHandle and ConnectorInsertTableHandle --- velox/connectors/Connector.cpp | 4 +- velox/connectors/Connector.h | 10 ++-- velox/connectors/fuzzer/FuzzerConnector.h | 2 +- velox/connectors/hive/HiveConnector.cpp | 2 +- velox/connectors/hive/HiveConnector.h | 2 +- velox/connectors/hive/HiveDataSource.cpp | 2 +- velox/connectors/hive/HiveDataSource.h | 2 +- velox/connectors/hive/TableHandle.cpp | 4 +- velox/connectors/hive/TableHandle.h | 4 +- .../hive/iceberg/tests/IcebergReadTest.cpp | 4 +- velox/connectors/tests/ConnectorTest.cpp | 3 +- velox/connectors/tpch/TpchConnector.cpp | 2 +- velox/connectors/tpch/TpchConnector.h | 6 +- velox/core/PlanNode.cpp | 11 ++-- velox/core/PlanNode.h | 20 ++++--- velox/core/tests/PlanNodeBuilderTest.cpp | 14 +++-- velox/core/tests/PlanNodeTest.cpp | 8 ++- .../tests/reader/ParquetTableScanTest.cpp | 40 +++++++------ 
velox/exec/AggregateCompanionAdapter.cpp | 4 +- velox/exec/IndexLookupJoin.h | 4 +- velox/exec/MergeJoin.h | 2 +- velox/exec/TableScan.h | 7 ++- velox/exec/fuzzer/WriterFuzzer.cpp | 12 +++- velox/exec/tests/AssertQueryBuilderTest.cpp | 2 +- velox/exec/tests/AsyncConnectorTest.cpp | 3 +- velox/exec/tests/HashJoinTest.cpp | 6 +- velox/exec/tests/IndexLookupJoinTest.cpp | 36 +++++++++--- velox/exec/tests/MergeJoinTest.cpp | 2 +- velox/exec/tests/TableScanTest.cpp | 56 +++++++++++++------ .../exec/tests/utils/HiveConnectorTestBase.h | 9 +-- .../tests/utils/IndexLookupJoinTestBase.cpp | 2 +- .../tests/utils/IndexLookupJoinTestBase.h | 2 +- velox/exec/tests/utils/PlanBuilder.cpp | 8 ++- velox/exec/tests/utils/PlanBuilder.h | 14 +++-- .../tests/utils/TestIndexStorageConnector.cpp | 4 +- .../tests/utils/TestIndexStorageConnector.h | 7 ++- velox/experimental/wave/exec/TableScan.h | 7 ++- velox/python/plan_builder/PyPlanBuilder.cpp | 4 +- velox/substrait/SubstraitToVeloxPlan.cpp | 4 +- .../trace/tests/TableScanReplayerTest.cpp | 4 +- 40 files changed, 210 insertions(+), 129 deletions(-) diff --git a/velox/connectors/Connector.cpp b/velox/connectors/Connector.cpp index cddbc31d8de1..6faa280a2e8c 100644 --- a/velox/connectors/Connector.cpp +++ b/velox/connectors/Connector.cpp @@ -152,13 +152,13 @@ CommitStrategy stringToCommitStrategy(const std::string& strategy) { } } -folly::dynamic ColumnHandle::serializeBase(std::string_view name) { +folly::dynamic ConnectorColumnHandle::serializeBase(std::string_view name) { folly::dynamic obj = folly::dynamic::object; obj["name"] = name; return obj; } -folly::dynamic ColumnHandle::serialize() const { +folly::dynamic ConnectorColumnHandle::serialize() const { return serializeBase("ColumnHandle"); } diff --git a/velox/connectors/Connector.h b/velox/connectors/Connector.h index 96b0e045830d..53ea387c468c 100644 --- a/velox/connectors/Connector.h +++ b/velox/connectors/Connector.h @@ -88,9 +88,9 @@ struct ConnectorSplit : public ISerializable { } }; -class ColumnHandle : public ISerializable { +class ConnectorColumnHandle : public ISerializable { public: - virtual ~ColumnHandle() = default; + virtual ~ConnectorColumnHandle() = default; virtual const std::string& name() const { VELOX_UNSUPPORTED(); @@ -102,7 +102,7 @@ class ColumnHandle : public ISerializable { static folly::dynamic serializeBase(std::string_view name); }; -using ColumnHandlePtr = std::shared_ptr; +using ConnectorColumnHandlePtr = std::shared_ptr; class ConnectorTableHandle : public ISerializable { public: @@ -555,7 +555,7 @@ class Connector { const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, ConnectorQueryCtx* connectorQueryCtx) = 0; /// Returns true if addSplit of DataSource can use 'dataSource' from @@ -616,7 +616,7 @@ class Connector { const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, ConnectorQueryCtx* connectorQueryCtx) { VELOX_UNSUPPORTED( "Connector {} does not support index source", connectorId()); diff --git a/velox/connectors/fuzzer/FuzzerConnector.h b/velox/connectors/fuzzer/FuzzerConnector.h index 64477b73ea36..33cc1f819fc6 100644 --- a/velox/connectors/fuzzer/FuzzerConnector.h +++ b/velox/connectors/fuzzer/FuzzerConnector.h @@ -113,7 +113,7 @@ class FuzzerConnector final : public Connector { const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& 
/*columnHandles*/, + std::shared_ptr>& /*columnHandles*/, ConnectorQueryCtx* connectorQueryCtx) override final { return std::make_unique( outputType, tableHandle, connectorQueryCtx->memoryPool()); diff --git a/velox/connectors/hive/HiveConnector.cpp b/velox/connectors/hive/HiveConnector.cpp index 4ef9e8f06179..176c1ac42a72 100644 --- a/velox/connectors/hive/HiveConnector.cpp +++ b/velox/connectors/hive/HiveConnector.cpp @@ -75,7 +75,7 @@ std::unique_ptr HiveConnector::createDataSource( const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, ConnectorQueryCtx* connectorQueryCtx) { return std::make_unique( outputType, diff --git a/velox/connectors/hive/HiveConnector.h b/velox/connectors/hive/HiveConnector.h index 937fdd850edf..546ecd5d2733 100644 --- a/velox/connectors/hive/HiveConnector.h +++ b/velox/connectors/hive/HiveConnector.h @@ -53,7 +53,7 @@ class HiveConnector : public Connector { const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, ConnectorQueryCtx* connectorQueryCtx) override; bool supportsSplitPreload() override { diff --git a/velox/connectors/hive/HiveDataSource.cpp b/velox/connectors/hive/HiveDataSource.cpp index 597f982c9056..a092e34f8904 100644 --- a/velox/connectors/hive/HiveDataSource.cpp +++ b/velox/connectors/hive/HiveDataSource.cpp @@ -61,7 +61,7 @@ HiveDataSource::HiveDataSource( const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, FileHandleFactory* fileHandleFactory, folly::Executor* executor, const ConnectorQueryCtx* connectorQueryCtx, diff --git a/velox/connectors/hive/HiveDataSource.h b/velox/connectors/hive/HiveDataSource.h index 9f79fc2106b5..01d496a6c521 100644 --- a/velox/connectors/hive/HiveDataSource.h +++ b/velox/connectors/hive/HiveDataSource.h @@ -40,7 +40,7 @@ class HiveDataSource : public DataSource { const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, FileHandleFactory* fileHandleFactory, folly::Executor* executor, const ConnectorQueryCtx* connectorQueryCtx, diff --git a/velox/connectors/hive/TableHandle.cpp b/velox/connectors/hive/TableHandle.cpp index 3f7c8b6f93d5..c8ba5625e8cf 100644 --- a/velox/connectors/hive/TableHandle.cpp +++ b/velox/connectors/hive/TableHandle.cpp @@ -53,7 +53,7 @@ HiveColumnHandle::ColumnType HiveColumnHandle::columnTypeFromName( } folly::dynamic HiveColumnHandle::serialize() const { - folly::dynamic obj = ColumnHandle::serializeBase("HiveColumnHandle"); + folly::dynamic obj = ConnectorColumnHandle::serializeBase("HiveColumnHandle"); obj["hiveColumnHandleName"] = name_; obj["columnType"] = columnTypeName(columnType_); obj["dataType"] = dataType_->serialize(); @@ -81,7 +81,7 @@ std::string HiveColumnHandle::toString() const { return out.str(); } -ColumnHandlePtr HiveColumnHandle::create(const folly::dynamic& obj) { +ConnectorColumnHandlePtr HiveColumnHandle::create(const folly::dynamic& obj) { auto name = obj["hiveColumnHandleName"].asString(); auto columnType = columnTypeFromName(obj["columnType"].asString()); auto dataType = ISerializable::deserialize(obj["dataType"]); diff --git a/velox/connectors/hive/TableHandle.h b/velox/connectors/hive/TableHandle.h index 2711da55a37a..0a6446061866 100644 --- a/velox/connectors/hive/TableHandle.h +++ 
b/velox/connectors/hive/TableHandle.h @@ -23,7 +23,7 @@ namespace facebook::velox::connector::hive { -class HiveColumnHandle : public ColumnHandle { +class HiveColumnHandle : public ConnectorColumnHandle { public: enum class ColumnType { kPartitionKey, @@ -114,7 +114,7 @@ class HiveColumnHandle : public ColumnHandle { folly::dynamic serialize() const override; - static ColumnHandlePtr create(const folly::dynamic& obj); + static ConnectorColumnHandlePtr create(const folly::dynamic& obj); static std::string columnTypeName(HiveColumnHandle::ColumnType columnType); diff --git a/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp b/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp index 5ed64da321f2..a3c64f55748f 100644 --- a/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp +++ b/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp @@ -721,7 +721,9 @@ TEST_F(HiveIcebergTest, testPartitionedRead) { splits.insert(splits.end(), icebergSplits.begin(), icebergSplits.end()); } - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments; assignments.insert( {"c0", diff --git a/velox/connectors/tests/ConnectorTest.cpp b/velox/connectors/tests/ConnectorTest.cpp index 88690fdb2c75..cb40cb56219b 100644 --- a/velox/connectors/tests/ConnectorTest.cpp +++ b/velox/connectors/tests/ConnectorTest.cpp @@ -35,7 +35,8 @@ class TestConnector : public connector::Connector { const std::shared_ptr& /* tableHandle */, const std::unordered_map< std::string, - std::shared_ptr>& /* columnHandles */, + std::shared_ptr< + connector::ConnectorColumnHandle>>& /* columnHandles */, connector::ConnectorQueryCtx* connectorQueryCtx) override { VELOX_NYI(); } diff --git a/velox/connectors/tpch/TpchConnector.cpp b/velox/connectors/tpch/TpchConnector.cpp index b2317774fe74..3b4dcf7a8323 100644 --- a/velox/connectors/tpch/TpchConnector.cpp +++ b/velox/connectors/tpch/TpchConnector.cpp @@ -62,7 +62,7 @@ TpchDataSource::TpchDataSource( const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, velox::memory::MemoryPool* pool) : pool_(pool) { auto tpchTableHandle = diff --git a/velox/connectors/tpch/TpchConnector.h b/velox/connectors/tpch/TpchConnector.h index 64fc4abf1be0..992a96ef3500 100644 --- a/velox/connectors/tpch/TpchConnector.h +++ b/velox/connectors/tpch/TpchConnector.h @@ -26,7 +26,7 @@ class TpchConnector; // TPC-H column handle only needs the column name (all columns are generated in // the same way). 
-class TpchColumnHandle : public ColumnHandle { +class TpchColumnHandle : public ConnectorColumnHandle { public: explicit TpchColumnHandle(const std::string& name) : name_(name) {} @@ -75,7 +75,7 @@ class TpchDataSource : public DataSource { const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, velox::memory::MemoryPool* pool); void addSplit(std::shared_ptr split) override; @@ -142,7 +142,7 @@ class TpchConnector final : public Connector { const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, ConnectorQueryCtx* connectorQueryCtx) override final { return std::make_unique( outputType, diff --git a/velox/core/PlanNode.cpp b/velox/core/PlanNode.cpp index fd2b389d6d3b..4cea1590e704 100644 --- a/velox/core/PlanNode.cpp +++ b/velox/core/PlanNode.cpp @@ -1101,14 +1101,17 @@ PlanNodePtr TableScanNode::create(const folly::dynamic& obj, void* context) { ISerializable::deserialize( obj["tableHandle"], context)); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments; for (const auto& pair : obj["assignments"]) { auto assign = pair["assign"].asString(); - auto columnHandle = ISerializable::deserialize( - pair["columnHandle"]); + auto columnHandle = + ISerializable::deserialize( + pair["columnHandle"]); assignments[assign] = - std::const_pointer_cast(columnHandle); + std::const_pointer_cast(columnHandle); } return std::make_shared( diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h index eee6974b63ee..35ea5e31dfd1 100644 --- a/velox/core/PlanNode.h +++ b/velox/core/PlanNode.h @@ -880,7 +880,7 @@ class TableScanNode : public PlanNode { const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& assignments) + std::shared_ptr>& assignments) : PlanNode(id), outputType_(std::move(outputType)), tableHandle_(tableHandle), @@ -916,7 +916,7 @@ class TableScanNode : public PlanNode { Builder& assignments( std::unordered_map< std::string, - std::shared_ptr> assignments) { + std::shared_ptr> assignments) { assignments_ = std::move(assignments); return *this; } @@ -944,7 +944,7 @@ class TableScanNode : public PlanNode { tableHandle_; std::optional>> + std::shared_ptr>> assignments_; }; @@ -969,9 +969,10 @@ class TableScanNode : public PlanNode { return tableHandle_; } - const std:: - unordered_map>& - assignments() const { + const std::unordered_map< + std::string, + std::shared_ptr>& + assignments() const { return assignments_; } @@ -988,9 +989,10 @@ class TableScanNode : public PlanNode { const RowTypePtr outputType_; const std::shared_ptr tableHandle_; - const std:: - unordered_map> - assignments_; + const std::unordered_map< + std::string, + std::shared_ptr> + assignments_; }; using TableScanNodePtr = std::shared_ptr; diff --git a/velox/core/tests/PlanNodeBuilderTest.cpp b/velox/core/tests/PlanNodeBuilderTest.cpp index fcd397fc4c04..e630485b2fed 100644 --- a/velox/core/tests/PlanNodeBuilderTest.cpp +++ b/velox/core/tests/PlanNodeBuilderTest.cpp @@ -186,11 +186,12 @@ TEST_F(PlanNodeBuilderTest, TableScanNode) { const RowTypePtr outputType = ROW({"c0", "c1"}, {INTEGER(), VARCHAR()}); const auto tableHandle = std::make_shared("connector_id"); - const std:: - unordered_map> - assignments{ - {"c0", std::make_shared()}, - {"c1", std::make_shared()}}; + const std::unordered_map< + std::string, + std::shared_ptr> + assignments{ + {"c0", std::make_shared()}, + 
{"c1", std::make_shared()}}; const auto verify = [&](const std::shared_ptr& node) { EXPECT_EQ(node->id(), id); @@ -688,7 +689,8 @@ TEST_F(PlanNodeBuilderTest, IndexLookupJoinNode) { .outputType(ROW({"c1"}, {VARCHAR()})) .tableHandle(std::make_shared( "connector_id")) - .assignments({{"c1", std::make_shared()}}) + .assignments( + {{"c1", std::make_shared()}}) .build(); const auto outputType = ROW({"c0"}, {BIGINT()}); diff --git a/velox/core/tests/PlanNodeTest.cpp b/velox/core/tests/PlanNodeTest.cpp index b1026300c0dd..27723f7eef13 100644 --- a/velox/core/tests/PlanNodeTest.cpp +++ b/velox/core/tests/PlanNodeTest.cpp @@ -46,7 +46,9 @@ TEST_F(PlanNodeTest, findFirstNode) { auto rowType = ROW({"name1"}, {BIGINT()}); std::shared_ptr tableHandle; - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments; std::shared_ptr tableScan3 = @@ -179,7 +181,7 @@ TEST_F(PlanNodeTest, isIndexLookupJoin) { nullptr, std::unordered_map< std::string, - std::shared_ptr>{}); + std::shared_ptr>{}); ASSERT_FALSE(isIndexLookupJoin(probeNode.get())); const auto buildNode = std::make_shared( "tableScan-build", @@ -187,7 +189,7 @@ TEST_F(PlanNodeTest, isIndexLookupJoin) { indexTableHandle, std::unordered_map< std::string, - std::shared_ptr>{}); + std::shared_ptr>{}); ASSERT_FALSE(isIndexLookupJoin(buildNode.get())); const std::vector leftKeys{ std::make_shared(BIGINT(), "c0")}; diff --git a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp index 63522267d61d..5741b349e695 100644 --- a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp @@ -69,8 +69,9 @@ class ParquetTableScanTest : public HiveConnectorTestBase { void assertSelectWithAssignments( std::vector&& outputColumnNames, - std::unordered_map>& - assignments, + std::unordered_map< + std::string, + std::shared_ptr>& assignments, const std::string& sql) { auto rowType = getRowType(std::move(outputColumnNames)); auto plan = PlanBuilder() @@ -86,7 +87,8 @@ class ParquetTableScanTest : public HiveConnectorTestBase { const std::string& sql, const std::unordered_map< std::string, - std::shared_ptr>& assignments = {}) { + std::shared_ptr>& assignments = + {}) { auto rowType = getRowType(std::move(outputColumnNames)); parse::ParseOptions options; options.parseDecimalAsDouble = false; @@ -719,7 +721,9 @@ TEST_F(ParquetTableScanTest, rowIndex) { }), std::nullopt, std::unordered_map{{kPath, filePath}}); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments; assignments["a"] = std::make_shared( "a", @@ -803,9 +807,9 @@ TEST_F(ParquetTableScanTest, filterNullIcebergPartition) { std::unordered_map>{ {"c1", std::nullopt}}); - std::shared_ptr c0 = makeColumnHandle( + std::shared_ptr c0 = makeColumnHandle( "c0", BIGINT(), BIGINT(), {}, HiveColumnHandle::ColumnType::kRegular); - std::shared_ptr c1 = makeColumnHandle( + std::shared_ptr c1 = makeColumnHandle( "c1", BIGINT(), BIGINT(), @@ -817,7 +821,9 @@ TEST_F(ParquetTableScanTest, filterNullIcebergPartition) { {"c1 IS NOT NULL"}, "", "SELECT c0, c1 FROM tmp WHERE c1 IS NOT NULL", - std::unordered_map>{ + std::unordered_map< + std::string, + std::shared_ptr>{ {"c0", c0}, {"c1", c1}}); assertSelectWithFilter( @@ -825,7 +831,9 @@ TEST_F(ParquetTableScanTest, filterNullIcebergPartition) { {"c1 IS NULL"}, "", "SELECT c0, c1 FROM tmp WHERE c1 IS NULL", - std::unordered_map>{ + std::unordered_map< + std::string, + std::shared_ptr>{ {"c0", 
c0}, {"c1", c1}}); } @@ -1342,15 +1350,15 @@ TEST_F(ParquetTableScanTest, booleanRle) { writeToParquetFile(file->getPath(), {vector}, options); loadData(file->getPath(), schema, vector); - std::shared_ptr c0 = makeColumnHandle( + std::shared_ptr c0 = makeColumnHandle( "c0", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); - std::shared_ptr c1 = makeColumnHandle( + std::shared_ptr c1 = makeColumnHandle( "c1", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); - std::shared_ptr c2 = makeColumnHandle( + std::shared_ptr c2 = makeColumnHandle( "c2", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); - std::shared_ptr c3 = makeColumnHandle( + std::shared_ptr c3 = makeColumnHandle( "c3", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); - std::shared_ptr c4 = makeColumnHandle( + std::shared_ptr c4 = makeColumnHandle( "c4", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); assertSelect({"c0"}, "SELECT c0 FROM tmp"); @@ -1378,11 +1386,11 @@ TEST_F(ParquetTableScanTest, singleBooleanRle) { writeToParquetFile(file->getPath(), {vector}, options); loadData(file->getPath(), schema, vector); - std::shared_ptr c0 = makeColumnHandle( + std::shared_ptr c0 = makeColumnHandle( "c0", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); - std::shared_ptr c1 = makeColumnHandle( + std::shared_ptr c1 = makeColumnHandle( "c1", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); - std::shared_ptr c2 = makeColumnHandle( + std::shared_ptr c2 = makeColumnHandle( "c2", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); assertSelect({"c0"}, "SELECT c0 FROM tmp"); diff --git a/velox/exec/AggregateCompanionAdapter.cpp b/velox/exec/AggregateCompanionAdapter.cpp index f77400e952d0..4b0f253ac9a4 100644 --- a/velox/exec/AggregateCompanionAdapter.cpp +++ b/velox/exec/AggregateCompanionAdapter.cpp @@ -415,8 +415,8 @@ bool CompanionFunctionsRegistrar::registerMergeExtractFunction( bool registered = false; if (CompanionSignatures::hasSameIntermediateTypesAcrossSignatures( signatures)) { - registered |= - registerMergeExtractFunctionWithSuffix(name, signatures, metadata, overwrite); + registered |= registerMergeExtractFunctionWithSuffix( + name, signatures, metadata, overwrite); } auto mergeExtractSignatures = diff --git a/velox/exec/IndexLookupJoin.h b/velox/exec/IndexLookupJoin.h index 30860f032e89..142e2c62ea9a 100644 --- a/velox/exec/IndexLookupJoin.h +++ b/velox/exec/IndexLookupJoin.h @@ -195,7 +195,9 @@ class IndexLookupJoin : public Operator { const RowTypePtr lookupType_; const std::shared_ptr lookupTableHandle_; const std::vector lookupConditions_; - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> lookupColumnHandles_; const std::shared_ptr connectorQueryCtx_; const std::shared_ptr connector_; diff --git a/velox/exec/MergeJoin.h b/velox/exec/MergeJoin.h index 674f932d0678..6b5162ff1486 100644 --- a/velox/exec/MergeJoin.h +++ b/velox/exec/MergeJoin.h @@ -443,7 +443,7 @@ class MergeJoin : public Operator { } void reset(); - + bool isRightJoinForFullOuter(vector_size_t row) { return rawRightJoinRows_[row]; } diff --git a/velox/exec/TableScan.h b/velox/exec/TableScan.h index daebdd85cd1e..64c83c88f16f 100644 --- a/velox/exec/TableScan.h +++ b/velox/exec/TableScan.h @@ -99,9 +99,10 @@ class TableScan : public SourceOperator { void tryScaleUp(); const std::shared_ptr tableHandle_; - const std:: - unordered_map> - columnHandles_; + const std::unordered_map< + std::string, + 
std::shared_ptr> + columnHandles_; DriverCtx* const driverCtx_; const int32_t maxSplitPreloadPerDriver_{0}; const vector_size_t maxReadBatchSize_; diff --git a/velox/exec/fuzzer/WriterFuzzer.cpp b/velox/exec/fuzzer/WriterFuzzer.cpp index 79109f9dfa84..92e31590d5e0 100644 --- a/velox/exec/fuzzer/WriterFuzzer.cpp +++ b/velox/exec/fuzzer/WriterFuzzer.cpp @@ -149,7 +149,9 @@ class WriterFuzzer { const std::shared_ptr& outputDirectoryPath); // Generates table column handles based on table column properties - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> getTableColumnHandles( const std::vector& names, const std::vector& types, @@ -635,13 +637,17 @@ void WriterFuzzer::verifyWriter( LOG(INFO) << "Verified results against reference DB"; } -std::unordered_map> +std::unordered_map< + std::string, + std::shared_ptr> WriterFuzzer::getTableColumnHandles( const std::vector& names, const std::vector& types, const int32_t partitionOffset, const int32_t bucketCount) { - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> columnHandle; for (int i = 0; i < names.size(); ++i) { HiveColumnHandle::ColumnType columnType; diff --git a/velox/exec/tests/AssertQueryBuilderTest.cpp b/velox/exec/tests/AssertQueryBuilderTest.cpp index 573d7349e509..0a7d6e5df4cb 100644 --- a/velox/exec/tests/AssertQueryBuilderTest.cpp +++ b/velox/exec/tests/AssertQueryBuilderTest.cpp @@ -93,7 +93,7 @@ TEST_F(AssertQueryBuilderTest, hiveSplits) { .assertResults("VALUES (1), (2), (3)"); // Split with partition key. - ColumnHandleMap assignments = { + ConnectorColumnHandleMap assignments = { {"ds", partitionKey("ds", VARCHAR())}, {"c0", regularColumn("c0", BIGINT())}}; diff --git a/velox/exec/tests/AsyncConnectorTest.cpp b/velox/exec/tests/AsyncConnectorTest.cpp index 7f2ad55c00d6..4eaf4021b399 100644 --- a/velox/exec/tests/AsyncConnectorTest.cpp +++ b/velox/exec/tests/AsyncConnectorTest.cpp @@ -140,7 +140,8 @@ class TestConnector : public connector::Connector { const std::shared_ptr& /* tableHandle */, const std::unordered_map< std::string, - std::shared_ptr>& /* columnHandles */, + std::shared_ptr< + connector::ConnectorColumnHandle>>& /* columnHandles */, connector::ConnectorQueryCtx* connectorQueryCtx) override { return std::make_unique(connectorQueryCtx->memoryPool()); } diff --git a/velox/exec/tests/HashJoinTest.cpp b/velox/exec/tests/HashJoinTest.cpp index 4ff343979326..418dc3c2aa81 100644 --- a/velox/exec/tests/HashJoinTest.cpp +++ b/velox/exec/tests/HashJoinTest.cpp @@ -3721,7 +3721,7 @@ TEST_F(HashJoinTest, dynamicFilters) { // having different names than column names in the files. 
{ auto scanOutputType = ROW({"a", "b"}, {INTEGER(), BIGINT()}); - ColumnHandleMap assignments; + ConnectorColumnHandleMap assignments; assignments["a"] = regularColumn("c0", INTEGER()); assignments["b"] = regularColumn("c1", BIGINT()); @@ -4500,7 +4500,7 @@ TEST_F(HashJoinTest, dynamicFiltersAppliedToPreloadedSplits) { } auto outputType = ROW({"p0", "p1"}, {BIGINT(), BIGINT()}); - ColumnHandleMap assignments = { + ConnectorColumnHandleMap assignments = { {"p0", regularColumn("p0", BIGINT())}, {"p1", partitionKey("p1", BIGINT())}}; createDuckDbTable("p", probeVectors); @@ -4894,7 +4894,7 @@ TEST_F(HashJoinTest, dynamicFilterOnPartitionKey) { .partitionKey("k", "0") .build(); auto outputType = ROW({"n1_0", "n1_1"}, {BIGINT(), BIGINT()}); - ColumnHandleMap assignments = { + ConnectorColumnHandleMap assignments = { {"n1_0", regularColumn("c0", BIGINT())}, {"n1_1", partitionKey("k", BIGINT())}}; diff --git a/velox/exec/tests/IndexLookupJoinTest.cpp b/velox/exec/tests/IndexLookupJoinTest.cpp index ac711764ce93..f411ad29df56 100644 --- a/velox/exec/tests/IndexLookupJoinTest.cpp +++ b/velox/exec/tests/IndexLookupJoinTest.cpp @@ -801,7 +801,9 @@ TEST_P(IndexLookupJoinTest, equalJoin) { const auto indexTableHandle = makeIndexTableHandle(indexTable, GetParam().asyncLookup); auto planNodeIdGenerator = std::make_shared(); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -1256,7 +1258,9 @@ TEST_P(IndexLookupJoinTest, betweenJoinCondition) { const auto indexTableHandle = makeIndexTableHandle(indexTable, GetParam().asyncLookup); auto planNodeIdGenerator = std::make_shared(); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -1577,7 +1581,9 @@ TEST_P(IndexLookupJoinTest, inJoinCondition) { const auto indexTableHandle = makeIndexTableHandle(indexTable, GetParam().asyncLookup); auto planNodeIdGenerator = std::make_shared(); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -1628,7 +1634,9 @@ DEBUG_ONLY_TEST_P(IndexLookupJoinTest, connectorError) { const auto indexTableHandle = makeIndexTableHandle(indexTable, GetParam().asyncLookup); auto planNodeIdGenerator = std::make_shared(); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -1697,7 +1705,9 @@ DEBUG_ONLY_TEST_P(IndexLookupJoinTest, prefetch) { const auto indexTableHandle = makeIndexTableHandle(indexTable, GetParam().asyncLookup); auto planNodeIdGenerator = std::make_shared(); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -1786,7 +1796,9 @@ TEST_P(IndexLookupJoinTest, outputBatchSize) { const auto indexTableHandle = makeIndexTableHandle(indexTable, GetParam().asyncLookup); auto planNodeIdGenerator = std::make_shared(); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -1855,7 +1867,9 @@ DEBUG_ONLY_TEST_P(IndexLookupJoinTest, runtimeStats) { const auto indexTableHandle = makeIndexTableHandle(indexTable, GetParam().asyncLookup); auto planNodeIdGenerator = std::make_shared(); - 
std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -1942,7 +1956,9 @@ TEST_P(IndexLookupJoinTest, barrier) { const auto indexTableHandle = makeIndexTableHandle(indexTable, GetParam().asyncLookup); auto planNodeIdGenerator = std::make_shared(); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -2017,7 +2033,9 @@ TEST_P(IndexLookupJoinTest, joinFuzzer) { std::random_device rd; std::mt19937 g(rd()); std::shuffle(scanOutput.begin(), scanOutput.end(), g); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, diff --git a/velox/exec/tests/MergeJoinTest.cpp b/velox/exec/tests/MergeJoinTest.cpp index 7323a1ca1dea..75acb1a079ff 100644 --- a/velox/exec/tests/MergeJoinTest.cpp +++ b/velox/exec/tests/MergeJoinTest.cpp @@ -1945,4 +1945,4 @@ TEST_F(MergeJoinTest, antiJoinWithTwoJoinKeysInDifferentBatch) { AssertQueryBuilder(plan, duckDbQueryRunner_) .assertResults( "SELECT * FROM t WHERE NOT exists (select * from u where t.a = u.c and t.b < u.d)"); -} \ No newline at end of file +} diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index 749f47d6be54..b4d2b1060b80 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -196,7 +196,7 @@ class TableScanTest : public HiveConnectorTestBase { .build(); auto outputType = ROW({"pkey", "c0", "c1"}, {partitionType, BIGINT(), DOUBLE()}); - ColumnHandleMap assignments = { + ConnectorColumnHandleMap assignments = { {"pkey", partitionKey("pkey", partitionType)}, {"c0", regularColumn("c0", BIGINT())}, {"c1", regularColumn("c1", DOUBLE())}}; @@ -479,7 +479,7 @@ TEST_F(TableScanTest, partitionKeyAlias) { writeToFile(filePath->getPath(), vectors); createDuckDbTable(vectors); - ColumnHandleMap assignments = { + ConnectorColumnHandleMap assignments = { {"a", regularColumn("c0", BIGINT())}, {"ds_alias", partitionKey("ds", VARCHAR())}}; @@ -713,7 +713,9 @@ TEST_F(TableScanTest, subfieldPruningRowType) { writeToFile(filePath->getPath(), vectors); std::vector requiredSubfields; requiredSubfields.emplace_back("e.c"); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments; assignments["e"] = std::make_shared( "e", @@ -770,7 +772,9 @@ TEST_F(TableScanTest, subfieldPruningRemainingFilterSubfieldsMissing) { writeToFile(filePath->getPath(), vectors); std::vector requiredSubfields; requiredSubfields.emplace_back("e.c"); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments; assignments["e"] = std::make_shared( "e", @@ -826,7 +830,9 @@ TEST_F(TableScanTest, subfieldPruningRemainingFilterRootFieldMissing) { auto vectors = makeVectors(10, 1'000, rowType); auto filePath = TempFilePath::create(); writeToFile(filePath->getPath(), vectors); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments; assignments["d"] = std::make_shared( "d", HiveColumnHandle::ColumnType::kRegular, BIGINT(), BIGINT()); @@ -869,7 +875,9 @@ TEST_F(TableScanTest, subfieldPruningRemainingFilterStruct) { for (int filterColumn = kWholeColumn; filterColumn <= kSubfieldOnly; ++filterColumn) { SCOPED_TRACE(fmt::format("{} {}", outputColumn, filterColumn)); - std::unordered_map> + std::unordered_map< + std::string, + 
std::shared_ptr> assignments; assignments["d"] = std::make_shared( "d", HiveColumnHandle::ColumnType::kRegular, BIGINT(), BIGINT()); @@ -955,7 +963,9 @@ TEST_F(TableScanTest, subfieldPruningRemainingFilterMap) { for (int filterColumn = kWholeColumn; filterColumn <= kSubfieldOnly; ++filterColumn) { SCOPED_TRACE(fmt::format("{} {}", outputColumn, filterColumn)); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments; assignments["a"] = std::make_shared( "a", HiveColumnHandle::ColumnType::kRegular, BIGINT(), BIGINT()); @@ -1055,7 +1065,9 @@ TEST_F(TableScanTest, subfieldPruningMapType) { requiredSubfields.emplace_back("c[0]"); requiredSubfields.emplace_back("c[2]"); requiredSubfields.emplace_back("c[4]"); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments; assignments["c"] = std::make_shared( "c", @@ -1139,7 +1151,9 @@ TEST_F(TableScanTest, subfieldPruningArrayType) { writeToFile(filePath->getPath(), vectors); std::vector requiredSubfields; requiredSubfields.emplace_back("c[3]"); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments; assignments["c"] = std::make_shared( "c", @@ -1302,7 +1316,7 @@ TEST_F(TableScanTest, missingColumns) { filters[common::Subfield("c1")] = lessThanOrEqualDouble(1050.0, true); auto tableHandle = std::make_shared( kHiveConnectorId, "tmp", true, std::move(filters), nullptr, dataColumns); - ColumnHandleMap assignments; + ConnectorColumnHandleMap assignments; assignments["c0"] = regularColumn("c0", BIGINT()); op = PlanBuilder(pool_.get()) .startTableScan() @@ -2143,7 +2157,7 @@ TEST_F(TableScanTest, partitionedTableDateKey) { .partitionKey("pkey", partitionValue) .build(); auto outputType = ROW({"pkey", "c0", "c1"}, {DATE(), BIGINT(), DOUBLE()}); - ColumnHandleMap assignments = { + ConnectorColumnHandleMap assignments = { {"pkey", partitionKey("pkey", DATE())}, {"c0", regularColumn("c0", BIGINT())}, {"c1", regularColumn("c1", DOUBLE())}}; @@ -2183,7 +2197,7 @@ TEST_F(TableScanTest, partitionedTableTimestampKey) { .partitionKey("pkey", partitionValue) .build(); - ColumnHandleMap assignments = { + ConnectorColumnHandleMap assignments = { {"pkey", partitionKey("pkey", TIMESTAMP())}, {"c0", regularColumn("c0", BIGINT())}, {"c1", regularColumn("c1", DOUBLE())}}; @@ -2511,7 +2525,8 @@ TEST_F(TableScanTest, statsBasedSkipping) { // c0 <= -1 -> whole file should be skipped based on stats auto subfieldFilters = singleSubfieldFilter("c0", lessThanOrEqual(-1)); - ColumnHandleMap assignments = {{"c1", regularColumn("c1", INTEGER())}}; + ConnectorColumnHandleMap assignments = { + {"c1", regularColumn("c1", INTEGER())}}; auto assertQuery = [&](const std::string& query) { auto tableHandle = makeTableHandle( @@ -3705,7 +3720,8 @@ TEST_F(TableScanTest, remainingFilter) { "SELECT * FROM tmp WHERE c1 > c0 AND c0 >= 0"); // Remaining filter uses columns that are not used otherwise. 
- ColumnHandleMap assignments = {{"c2", regularColumn("c2", DOUBLE())}}; + ConnectorColumnHandleMap assignments = { + {"c2", regularColumn("c2", DOUBLE())}}; assertQuery( PlanBuilder(pool_.get()) @@ -4314,7 +4330,8 @@ TEST_F(TableScanTest, interleaveLazyEager) { auto eagerFile = TempFilePath::create(); writeToFile(eagerFile->getPath(), rowsWithNulls); - ColumnHandleMap assignments = {{"c0", regularColumn("c0", column->type())}}; + ConnectorColumnHandleMap assignments = { + {"c0", regularColumn("c0", column->type())}}; CursorParameters params; params.planNode = PlanBuilder() .startTableScan() @@ -5127,7 +5144,7 @@ TEST_F(TableScanTest, varbinaryPartitionKey) { writeToFile(filePath->getPath(), vectors); createDuckDbTable(vectors); - ColumnHandleMap assignments = { + ConnectorColumnHandleMap assignments = { {"a", regularColumn("c0", BIGINT())}, {"ds_alias", partitionKey("ds", VARBINARY())}}; @@ -5184,7 +5201,8 @@ TEST_F(TableScanTest, timestampPartitionKey) { return splits; }; - ColumnHandleMap assignments = {{"t", partitionKey("t", TIMESTAMP())}}; + ConnectorColumnHandleMap assignments = { + {"t", partitionKey("t", TIMESTAMP())}}; auto plan = PlanBuilder() .startTableScan() .outputType(ROW({"t"}, {TIMESTAMP()})) @@ -5337,7 +5355,9 @@ TEST_F(TableScanTest, dynamicFilterWithRowIndexColumn) { {"row_index", "a"}, {makeFlatVector(5, folly::identity), makeFlatVector(5, folly::identity)}); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments; assignments["a"] = std::make_shared( "a", diff --git a/velox/exec/tests/utils/HiveConnectorTestBase.h b/velox/exec/tests/utils/HiveConnectorTestBase.h index c369af95defa..e63df79a898e 100644 --- a/velox/exec/tests/utils/HiveConnectorTestBase.h +++ b/velox/exec/tests/utils/HiveConnectorTestBase.h @@ -28,8 +28,9 @@ namespace facebook::velox::exec::test { static const std::string kHiveConnectorId = "test-hive"; -using ColumnHandleMap = - std::unordered_map>; +using ConnectorColumnHandleMap = std::unordered_map< + std::string, + std::shared_ptr>; class HiveConnectorTestBase : public OperatorTestBase { public: @@ -193,8 +194,8 @@ class HiveConnectorTestBase : public OperatorTestBase { const std::string& name, const TypePtr& type); - static ColumnHandleMap allRegularColumns(const RowTypePtr& rowType) { - ColumnHandleMap assignments; + static ConnectorColumnHandleMap allRegularColumns(const RowTypePtr& rowType) { + ConnectorColumnHandleMap assignments; assignments.reserve(rowType->size()); for (uint32_t i = 0; i < rowType->size(); ++i) { const auto& name = rowType->nameOf(i); diff --git a/velox/exec/tests/utils/IndexLookupJoinTestBase.cpp b/velox/exec/tests/utils/IndexLookupJoinTestBase.cpp index 8ec2d9710745..fb2bf1b7f4fa 100644 --- a/velox/exec/tests/utils/IndexLookupJoinTestBase.cpp +++ b/velox/exec/tests/utils/IndexLookupJoinTestBase.cpp @@ -285,7 +285,7 @@ IndexLookupJoinTestBase::makeIndexScanNode( const facebook::velox::RowTypePtr& outputType, std::unordered_map< std::string, - std::shared_ptr>& + std::shared_ptr>& assignments) { auto planBuilder = facebook::velox::exec::test::PlanBuilder( planNodeIdGenerator, pool_.get()); diff --git a/velox/exec/tests/utils/IndexLookupJoinTestBase.h b/velox/exec/tests/utils/IndexLookupJoinTestBase.h index 4fa0c3472826..b8712b9508e3 100644 --- a/velox/exec/tests/utils/IndexLookupJoinTestBase.h +++ b/velox/exec/tests/utils/IndexLookupJoinTestBase.h @@ -127,7 +127,7 @@ class IndexLookupJoinTestBase const facebook::velox::RowTypePtr& outputType, std::unordered_map< std::string, - 
std::shared_ptr>& + std::shared_ptr>& assignments); /// Generate sequence storage table which will be persisted by mock zippydb diff --git a/velox/exec/tests/utils/PlanBuilder.cpp b/velox/exec/tests/utils/PlanBuilder.cpp index 1ddf46c61976..01101141cdbb 100644 --- a/velox/exec/tests/utils/PlanBuilder.cpp +++ b/velox/exec/tests/utils/PlanBuilder.cpp @@ -74,7 +74,7 @@ PlanBuilder& PlanBuilder::tableScan( const RowTypePtr& dataColumns, const std::unordered_map< std::string, - std::shared_ptr>& assignments) { + std::shared_ptr>& assignments) { return TableScanBuilder(*this) .filtersAsNode(filtersAsNode_ ? planNodeIdGenerator_ : nullptr) .outputType(outputType) @@ -94,7 +94,7 @@ PlanBuilder& PlanBuilder::tableScan( const RowTypePtr& dataColumns, const std::unordered_map< std::string, - std::shared_ptr>& assignments) { + std::shared_ptr>& assignments) { return TableScanBuilder(*this) .filtersAsNode(filtersAsNode_ ? planNodeIdGenerator_ : nullptr) .tableName(tableName) @@ -112,7 +112,9 @@ PlanBuilder& PlanBuilder::tpchTableScan( std::vector columnNames, double scaleFactor, std::string_view connectorId) { - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignmentsMap; std::vector outputTypes; diff --git a/velox/exec/tests/utils/PlanBuilder.h b/velox/exec/tests/utils/PlanBuilder.h index 0307d4dcf491..6fc72c163c4c 100644 --- a/velox/exec/tests/utils/PlanBuilder.h +++ b/velox/exec/tests/utils/PlanBuilder.h @@ -130,7 +130,7 @@ class PlanBuilder { /// types (for all columns) in this argument as opposed to 'outputType', where /// you define the output types only. See 'missingColumns' test in /// 'TableScanTest'. - /// @param assignments Optional ColumnHandles. + /// @param assignments Optional ConnectorColumnHandles. PlanBuilder& tableScan( const RowTypePtr& outputType, const std::vector& subfieldFilters = {}, @@ -138,7 +138,7 @@ class PlanBuilder { const RowTypePtr& dataColumns = nullptr, const std::unordered_map< std::string, - std::shared_ptr>& assignments = {}); + std::shared_ptr>& assignments = {}); /// Add a TableScanNode to scan a Hive table. /// @@ -170,7 +170,7 @@ class PlanBuilder { const RowTypePtr& dataColumns = nullptr, const std::unordered_map< std::string, - std::shared_ptr>& assignments = {}); + std::shared_ptr>& assignments = {}); /// Add a TableScanNode to scan a TPC-H table. /// @@ -277,14 +277,14 @@ class PlanBuilder { return *this; } - /// @param assignments Optional ColumnHandles. + /// @param assignments Optional ConnectorColumnHandles. /// outputType names should match the keys in the 'assignments' map. The /// 'assignments' map may contain more columns than 'outputType' if some /// columns are only used by pushed-down filters. TableScanBuilder& assignments( std::unordered_map< std::string, - std::shared_ptr> assignments) { + std::shared_ptr> assignments) { assignments_ = std::move(assignments); return *this; } @@ -308,7 +308,9 @@ class PlanBuilder { RowTypePtr dataColumns_; std::unordered_map columnAliases_; std::shared_ptr tableHandle_; - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments_; // produce filters as a FilterNode instead of pushdown. 
diff --git a/velox/exec/tests/utils/TestIndexStorageConnector.cpp b/velox/exec/tests/utils/TestIndexStorageConnector.cpp index 3dfdfe04e3b2..760fb57be00c 100644 --- a/velox/exec/tests/utils/TestIndexStorageConnector.cpp +++ b/velox/exec/tests/utils/TestIndexStorageConnector.cpp @@ -33,7 +33,7 @@ core::TypedExprPtr toJoinConditionExpr( const RowTypePtr& inputType, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles) { + std::shared_ptr>& columnHandles) { if (joinConditions.empty()) { return nullptr; } @@ -463,7 +463,7 @@ std::shared_ptr TestIndexConnector::createIndexSource( const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, connector::ConnectorQueryCtx* connectorQueryCtx) { VELOX_CHECK_GE(inputType->size(), numJoinKeys + joinConditions.size()); auto testIndexTableHandle = diff --git a/velox/exec/tests/utils/TestIndexStorageConnector.h b/velox/exec/tests/utils/TestIndexStorageConnector.h index 08a1c5d69b94..795bc49be800 100644 --- a/velox/exec/tests/utils/TestIndexStorageConnector.h +++ b/velox/exec/tests/utils/TestIndexStorageConnector.h @@ -275,8 +275,9 @@ class TestIndexConnector : public connector::Connector { std::unique_ptr createDataSource( const RowTypePtr&, const std::shared_ptr&, - const std:: - unordered_map>&, + const std::unordered_map< + std::string, + std::shared_ptr>&, connector::ConnectorQueryCtx*) override { VELOX_UNSUPPORTED("{} not implemented", __FUNCTION__); } @@ -289,7 +290,7 @@ class TestIndexConnector : public connector::Connector { const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, connector::ConnectorQueryCtx* connectorQueryCtx) override; std::unique_ptr createDataSink( diff --git a/velox/experimental/wave/exec/TableScan.h b/velox/experimental/wave/exec/TableScan.h index c07530789494..dc935bcae71f 100644 --- a/velox/experimental/wave/exec/TableScan.h +++ b/velox/experimental/wave/exec/TableScan.h @@ -110,9 +110,10 @@ class TableScan : public WaveSourceOperator { static std::atomic ioWaitNanos_; const std::shared_ptr tableHandle_; - const std:: - unordered_map> - columnHandles_; + const std::unordered_map< + std::string, + std::shared_ptr> + columnHandles_; exec::DriverCtx* const driverCtx_; memory::MemoryPool* const connectorPool_; ContinueFuture blockingFuture_{ContinueFuture::makeEmpty()}; diff --git a/velox/python/plan_builder/PyPlanBuilder.cpp b/velox/python/plan_builder/PyPlanBuilder.cpp index c2509776f6d1..ab581fc0badd 100644 --- a/velox/python/plan_builder/PyPlanBuilder.cpp +++ b/velox/python/plan_builder/PyPlanBuilder.cpp @@ -138,7 +138,9 @@ PyPlanBuilder& PyPlanBuilder::tableScan( // If there are subfields, create the appropriate structures and add to the // scan. if (!subfields.empty() || !rowIndexColumnName.empty()) { - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments; for (size_t i = 0; i < outputRowSchema->size(); ++i) { diff --git a/velox/substrait/SubstraitToVeloxPlan.cpp b/velox/substrait/SubstraitToVeloxPlan.cpp index 0a8ae48a5932..165da856597f 100644 --- a/velox/substrait/SubstraitToVeloxPlan.cpp +++ b/velox/substrait/SubstraitToVeloxPlan.cpp @@ -430,7 +430,9 @@ core::PlanNodePtr SubstraitVeloxPlanConverter::toVeloxPlan( // Get assignments and out names. 
std::vector outNames; outNames.reserve(colNameList.size()); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments; for (int idx = 0; idx < colNameList.size(); idx++) { auto outName = substraitParser_->makeNodeName(planNodeId_, idx); diff --git a/velox/tool/trace/tests/TableScanReplayerTest.cpp b/velox/tool/trace/tests/TableScanReplayerTest.cpp index be504a2c3803..0aa8b708e25e 100644 --- a/velox/tool/trace/tests/TableScanReplayerTest.cpp +++ b/velox/tool/trace/tests/TableScanReplayerTest.cpp @@ -300,7 +300,9 @@ TEST_F(TableScanReplayerTest, subfieldPrunning) { writeToFile(filePath->getPath(), vectors); std::vector requiredSubfields; requiredSubfields.emplace_back("e.c"); - std::unordered_map> + std::unordered_map< + std::string, + std::shared_ptr> assignments; assignments["e"] = std::make_shared( "e", From b54bd5f4ff8a7dbe5d683a1079a475f71c2d6668 Mon Sep 17 00:00:00 2001 From: yingsu00 Date: Sun, 25 May 2025 14:58:44 +0800 Subject: [PATCH 10/10] Make connectors plugins --- velox/benchmarks/QueryBenchmarkBase.cpp | 10 +- velox/benchmarks/QueryBenchmarkBase.h | 2 +- velox/buffer/tests/BufferTest.cpp | 2 +- velox/common/base/AsyncSource.h | 4 +- velox/common/base/BloomFilter.h | 4 +- velox/common/base/SpillConfig.cpp | 2 +- velox/common/base/SpillConfig.h | 2 +- velox/common/base/SpillStats.h | 2 +- velox/common/caching/SsdFile.cpp | 6 +- .../compression/tests/CompressionTest.cpp | 2 +- velox/common/file/File.cpp | 4 +- velox/common/file/File.h | 2 +- velox/common/hyperloglog/DenseHll.cpp | 10 +- velox/common/hyperloglog/SparseHll.cpp | 6 +- .../hyperloglog/benchmarks/DenseHll.cpp | 6 +- velox/common/time/Timer.cpp | 2 +- velox/connectors/BeforeConnectorRefactor.puml | 2 +- velox/connectors/CMakeLists.txt | 2 +- velox/connectors/after_connector_plugin.puml | 2 +- .../before_connector_registery.puml | 2 +- velox/connectors/common/CMakeLists.txt | 44 + velox/connectors/{ => common}/Connector.cpp | 15 +- velox/connectors/{ => common}/Connector.h | 60 +- velox/connectors/common/ConnectorNames.h | 32 + .../common/ConnectorObjectFactory.cpp | 21 + .../common/ConnectorObjectFactory.h | 107 + velox/connectors/common/PluginLoader.cpp | 33 + velox/connectors/common/PluginLoader.h | 27 + velox/connectors/fuzzer/FuzzerConnector.cpp | 4 +- velox/connectors/fuzzer/FuzzerConnector.h | 42 +- .../connectors/fuzzer/FuzzerConnectorSplit.h | 6 +- .../fuzzer/tests/FuzzerConnectorTestBase.h | 10 +- velox/connectors/hive/CMakeLists.txt | 38 + velox/connectors/hive/HiveConnector.cpp | 18 +- velox/connectors/hive/HiveConnector.h | 38 +- velox/connectors/hive/HiveConnectorSplit.h | 6 +- velox/connectors/hive/HiveConnectorUtil.cpp | 86 +- velox/connectors/hive/HiveConnectorUtil.h | 26 +- velox/connectors/hive/HiveDataSink.cpp | 61 +- velox/connectors/hive/HiveDataSink.h | 191 +- velox/connectors/hive/HiveDataSource.cpp | 16 +- velox/connectors/hive/HiveDataSource.h | 36 +- velox/connectors/hive/HiveObjectFactory.cpp | 353 + velox/connectors/hive/HiveObjectFactory.h | 62 + velox/connectors/hive/HivePlugin.cpp | 37 + velox/connectors/hive/SplitReader.cpp | 22 +- velox/connectors/hive/SplitReader.h | 24 +- velox/connectors/hive/TableHandle.cpp | 24 +- velox/connectors/hive/TableHandle.h | 22 +- .../hive/iceberg/IcebergSplitReader.cpp | 6 +- .../hive/iceberg/IcebergSplitReader.h | 8 +- .../iceberg/PositionalDeleteFileReader.cpp | 6 +- .../hive/iceberg/PositionalDeleteFileReader.h | 6 +- .../hive/iceberg/tests/IcebergReadTest.cpp | 12 +- .../tests/IcebergSplitReaderBenchmark.cpp | 6 +- 
.../storage_adapters/abfs/AbfsFileSystem.cpp | 4 +- .../hive/storage_adapters/abfs/AbfsReadFile.h | 2 +- .../gcs/tests/GcsInsertTest.cpp | 10 +- .../s3fs/tests/S3InsertTest.cpp | 10 +- .../s3fs/tests/S3MultipleEndpointsTest.cpp | 20 +- .../s3fs/tests/S3ReadTest.cpp | 10 +- .../hive/tests/HiveConnectorSerDeTest.cpp | 8 +- .../hive/tests/HiveConnectorTest.cpp | 6 +- .../hive/tests/HiveConnectorUtilTest.cpp | 18 +- .../hive/tests/HiveDataSinkTest.cpp | 73 +- .../iceberg/IcebergObjectFactory.cpp | 62 + .../connectors/iceberg/IcebergObjectFactory.h | 16 + .../connectors/iceberg/IcebergTableHandle.cpp | 5 + velox/connectors/iceberg/IcebergTableHandle.h | 10 + velox/connectors/tests/ConnectorTest.cpp | 32 +- velox/connectors/tpch/TpchConnector.cpp | 6 +- velox/connectors/tpch/TpchConnector.h | 46 +- .../tpch/TpchConnectorObjectFactory.cpp | 341 + velox/connectors/tpch/TpchConnectorSplit.h | 6 +- velox/connectors/tpch/TpchPlugin.cpp | 36 + velox/connectors/tpch/tests/SpeedTest.cpp | 12 +- .../tpch/tests/TpchConnectorTest.cpp | 10 +- velox/core/ExpressionEvaluator.h | 2 +- velox/core/PlanNode.cpp | 18 +- velox/core/PlanNode.h | 42 +- velox/core/tests/PlanNodeBuilderTest.cpp | 18 +- velox/core/tests/PlanNodeTest.cpp | 12 +- velox/dwio/common/ColumnSelector.cpp | 14 +- velox/dwio/common/FileSink.h | 2 +- velox/dwio/common/Options.cpp | 87 + velox/dwio/common/Options.h | 5 +- velox/dwio/common/ScanSpec.cpp | 12 +- velox/dwio/common/ScanSpec.h | 12 +- .../common/SelectiveStructColumnReader.cpp | 2 +- velox/dwio/common/tests/ReaderTest.cpp | 20 +- .../common/tests/utils/DataSetBuilder.cpp | 8 +- .../dwio/common/tests/utils/DataSetBuilder.h | 22 +- .../common/tests/utils/E2EFilterTestBase.cpp | 42 +- .../common/tests/utils/E2EFilterTestBase.h | 16 +- velox/dwio/dwrf/common/ByteRLE.cpp | 24 +- velox/dwio/dwrf/common/ByteRLE.h | 8 +- velox/dwio/dwrf/common/Compression.h | 14 +- velox/dwio/dwrf/common/Config.cpp | 4 +- velox/dwio/dwrf/common/Config.h | 2 +- velox/dwio/dwrf/common/FileMetadata.cpp | 4 +- velox/dwio/dwrf/common/FileMetadata.h | 2 +- velox/dwio/dwrf/common/FloatingPointDecoder.h | 2 +- velox/dwio/dwrf/common/IntEncoder.h | 14 +- velox/dwio/dwrf/common/RLEv1.h | 16 +- velox/dwio/dwrf/reader/DwrfData.cpp | 2 +- velox/dwio/dwrf/reader/DwrfData.h | 4 +- velox/dwio/dwrf/reader/DwrfReader.cpp | 4 +- velox/dwio/dwrf/reader/DwrfReader.h | 2 +- velox/dwio/dwrf/reader/ReaderBase.cpp | 2 +- velox/dwio/dwrf/reader/ReaderBase.h | 6 +- .../reader/SelectiveByteRleColumnReader.h | 2 +- .../reader/SelectiveDecimalColumnReader.cpp | 42 +- .../reader/SelectiveDecimalColumnReader.h | 8 +- .../dwio/dwrf/reader/SelectiveDwrfReader.cpp | 4 +- velox/dwio/dwrf/reader/SelectiveDwrfReader.h | 4 +- .../reader/SelectiveFlatMapColumnReader.cpp | 22 +- .../reader/SelectiveFlatMapColumnReader.h | 2 +- .../SelectiveFloatingPointColumnReader.h | 4 +- ...SelectiveIntegerDictionaryColumnReader.cpp | 2 +- .../SelectiveIntegerDictionaryColumnReader.h | 2 +- .../SelectiveIntegerDirectColumnReader.h | 2 +- .../reader/SelectiveRepeatedColumnReader.cpp | 10 +- .../reader/SelectiveRepeatedColumnReader.h | 4 +- .../SelectiveStringDictionaryColumnReader.cpp | 2 +- .../SelectiveStringDictionaryColumnReader.h | 2 +- .../SelectiveStringDirectColumnReader.cpp | 6 +- .../SelectiveStringDirectColumnReader.h | 2 +- .../reader/SelectiveStructColumnReader.cpp | 2 +- .../dwrf/reader/SelectiveStructColumnReader.h | 4 +- .../reader/SelectiveTimestampColumnReader.cpp | 24 +- .../reader/SelectiveTimestampColumnReader.h | 6 +- 
.../dwio/dwrf/test/ColumnWriterIndexTest.cpp | 16 +- velox/dwio/dwrf/test/ColumnWriterTest.cpp | 20 +- velox/dwio/dwrf/test/E2EWriterTest.cpp | 16 +- .../dwrf/test/FloatColumnWriterBenchmark.cpp | 2 +- velox/dwio/dwrf/test/IntEncoderBenchmark.cpp | 2 +- velox/dwio/dwrf/test/LayoutPlannerTests.cpp | 2 +- velox/dwio/dwrf/test/ReaderTest.cpp | 46 +- velox/dwio/dwrf/test/TestByteRLEEncoder.cpp | 10 +- velox/dwio/dwrf/test/TestColumnReader.cpp | 16 +- velox/dwio/dwrf/test/TestIntDirect.cpp | 2 +- velox/dwio/dwrf/test/TestRLEv1Encoder.cpp | 28 +- .../dwrf/test/TestStatisticsBuilderUtils.cpp | 24 +- velox/dwio/dwrf/utils/ProtoUtils.cpp | 2 +- velox/dwio/dwrf/writer/ColumnWriter.cpp | 98 +- velox/dwio/dwrf/writer/ColumnWriter.h | 8 +- .../dwio/dwrf/writer/FlatMapColumnWriter.cpp | 16 +- velox/dwio/dwrf/writer/FlatMapColumnWriter.h | 16 +- .../dwrf/writer/IntegerDictionaryEncoder.h | 2 +- .../dwrf/writer/StatisticsBuilderUtils.cpp | 10 +- .../dwio/dwrf/writer/StatisticsBuilderUtils.h | 22 +- velox/dwio/dwrf/writer/Writer.cpp | 2 +- velox/dwio/dwrf/writer/Writer.h | 2 +- velox/dwio/dwrf/writer/WriterBase.cpp | 4 +- velox/dwio/dwrf/writer/WriterBase.h | 2 +- velox/dwio/dwrf/writer/WriterContext.cpp | 4 +- velox/dwio/dwrf/writer/WriterContext.h | 8 +- velox/dwio/orc/test/ReaderFilterTest.cpp | 10 +- velox/dwio/orc/test/ReaderTest.cpp | 12 +- .../dwio/parquet/reader/BooleanColumnReader.h | 2 +- .../reader/FloatingPointColumnReader.h | 4 +- .../dwio/parquet/reader/IntegerColumnReader.h | 2 +- velox/dwio/parquet/reader/Metadata.cpp | 14 +- velox/dwio/parquet/reader/Metadata.h | 2 +- velox/dwio/parquet/reader/PageReader.cpp | 2 +- velox/dwio/parquet/reader/PageReader.h | 18 +- .../parquet/reader/ParquetColumnReader.cpp | 2 +- .../dwio/parquet/reader/ParquetColumnReader.h | 2 +- velox/dwio/parquet/reader/ParquetData.cpp | 8 +- velox/dwio/parquet/reader/ParquetData.h | 6 +- .../parquet/reader/RepeatedColumnReader.cpp | 4 +- .../parquet/reader/RepeatedColumnReader.h | 4 +- velox/dwio/parquet/reader/RleBpDataDecoder.h | 2 +- .../parquet/reader/StringColumnReader.cpp | 2 +- .../dwio/parquet/reader/StringColumnReader.h | 2 +- .../parquet/reader/StructColumnReader.cpp | 2 +- .../dwio/parquet/reader/StructColumnReader.h | 2 +- .../parquet/reader/TimestampColumnReader.h | 6 +- velox/dwio/parquet/tests/ParquetTpchTest.cpp | 20 +- .../parquet/tests/reader/E2EFilterTest.cpp | 8 +- .../tests/reader/ParquetPageReaderTest.cpp | 6 +- .../tests/reader/ParquetTableScanTest.cpp | 36 +- .../tests/writer/ParquetWriterTest.cpp | 6 +- velox/dwio/parquet/writer/Writer.cpp | 20 +- velox/dwio/parquet/writer/Writer.h | 4 +- velox/examples/ScanAndSort.cpp | 10 +- velox/exec/Driver.cpp | 8 +- velox/exec/Driver.h | 6 +- velox/exec/Exchange.cpp | 2 +- velox/exec/Exchange.h | 4 +- velox/exec/GroupingSet.cpp | 14 +- velox/exec/GroupingSet.h | 18 +- velox/exec/HashBuild.cpp | 4 +- velox/exec/HashBuild.h | 4 +- velox/exec/HashJoinBridge.cpp | 10 +- velox/exec/HashJoinBridge.h | 6 +- velox/exec/IndexLookupJoin.cpp | 6 +- velox/exec/IndexLookupJoin.h | 14 +- velox/exec/Merge.cpp | 2 +- velox/exec/MergeSource.cpp | 4 +- velox/exec/Operator.cpp | 12 +- velox/exec/Operator.h | 18 +- velox/exec/OperatorTraceReader.cpp | 2 +- velox/exec/OperatorTraceReader.h | 4 +- velox/exec/OperatorTraceWriter.h | 2 +- velox/exec/OutputBuffer.cpp | 4 +- velox/exec/PartitionStreamingWindowBuild.cpp | 2 +- velox/exec/PartitionStreamingWindowBuild.h | 4 +- velox/exec/PartitionedOutput.cpp | 2 +- velox/exec/RowNumber.cpp | 4 +- velox/exec/RowNumber.h | 4 +- 
velox/exec/RowsStreamingWindowBuild.cpp | 2 +- velox/exec/RowsStreamingWindowBuild.h | 4 +- velox/exec/SortBuffer.cpp | 4 +- velox/exec/SortBuffer.h | 10 +- velox/exec/SortWindowBuild.cpp | 8 +- velox/exec/SortWindowBuild.h | 12 +- velox/exec/Spill.cpp | 16 +- velox/exec/Spill.h | 22 +- velox/exec/SpillFile.cpp | 26 +- velox/exec/SpillFile.h | 28 +- velox/exec/Spiller.cpp | 22 +- velox/exec/Spiller.h | 24 +- velox/exec/Split.h | 6 +- velox/exec/TableScan.cpp | 14 +- velox/exec/TableScan.h | 18 +- velox/exec/TableWriter.cpp | 6 +- velox/exec/TableWriter.h | 16 +- velox/exec/Task.cpp | 41 +- velox/exec/Task.h | 10 +- velox/exec/TaskStructs.h | 2 +- velox/exec/VectorHasher.cpp | 4 +- velox/exec/VectorHasher.h | 2 +- velox/exec/Window.cpp | 2 +- velox/exec/WindowBuild.cpp | 2 +- velox/exec/WindowBuild.h | 6 +- velox/exec/benchmarks/PrefixSortBenchmark.cpp | 4 +- velox/exec/fuzzer/AggregationFuzzer.cpp | 2 +- velox/exec/fuzzer/AggregationFuzzerBase.cpp | 2 +- velox/exec/fuzzer/AggregationFuzzerBase.h | 2 +- velox/exec/fuzzer/FuzzerUtil.cpp | 8 +- velox/exec/fuzzer/FuzzerUtil.h | 2 +- velox/exec/fuzzer/JoinFuzzer.cpp | 6 +- velox/exec/fuzzer/MemoryArbitrationFuzzer.cpp | 6 +- velox/exec/fuzzer/WriterFuzzer.cpp | 6 +- velox/exec/fuzzer/WriterFuzzerRunner.h | 6 +- .../tests/AddressableNonNullValueListTest.cpp | 2 +- .../tests/AggregateSpillBenchmarkBase.cpp | 2 +- velox/exec/tests/AggregationTest.cpp | 2 +- velox/exec/tests/AssertQueryBuilderTest.cpp | 16 +- velox/exec/tests/AsyncConnectorTest.cpp | 50 +- .../tests/ConcatFilesSpillMergeStreamTest.cpp | 8 +- velox/exec/tests/ContainerRowSerdeTest.cpp | 2 +- velox/exec/tests/ExchangeClientTest.cpp | 4 +- velox/exec/tests/HashJoinBridgeTest.cpp | 2 +- velox/exec/tests/HashJoinTest.cpp | 10 +- velox/exec/tests/HashTableTest.cpp | 2 +- velox/exec/tests/IndexLookupJoinTest.cpp | 32 +- .../tests/JoinSpillInputBenchmarkBase.cpp | 2 +- velox/exec/tests/MultiFragmentTest.cpp | 78 +- velox/exec/tests/OperatorTraceTest.cpp | 4 +- velox/exec/tests/OrderByTest.cpp | 2 +- velox/exec/tests/PlanNodeSerdeTest.cpp | 4 +- velox/exec/tests/PrefixSortTest.cpp | 4 +- velox/exec/tests/SortBufferTest.cpp | 22 +- velox/exec/tests/SpillTest.cpp | 22 +- velox/exec/tests/SpillerBenchmarkBase.h | 2 +- velox/exec/tests/SpillerTest.cpp | 32 +- velox/exec/tests/TableEvolutionFuzzerTest.cpp | 6 +- velox/exec/tests/TableScanTest.cpp | 82 +- velox/exec/tests/TableWriterTest.cpp | 7210 +++++++++-------- velox/exec/tests/UnnestTest.cpp | 2 +- velox/exec/tests/VeloxIn10MinDemo.cpp | 12 +- velox/exec/tests/WindowTest.cpp | 8 +- velox/exec/tests/utils/AssertQueryBuilder.cpp | 8 +- velox/exec/tests/utils/AssertQueryBuilder.h | 8 +- velox/exec/tests/utils/CMakeLists.txt | 3 +- velox/exec/tests/utils/HashJoinTestBase.h | 6 +- .../tests/utils/HiveConnectorTestBase.cpp | 339 +- .../exec/tests/utils/HiveConnectorTestBase.h | 160 +- .../tests/utils/IndexLookupJoinTestBase.cpp | 4 +- .../tests/utils/IndexLookupJoinTestBase.h | 6 +- .../exec/tests/utils/LocalExchangeSource.cpp | 6 +- .../exec/tests/utils/LocalRunnerTestBase.cpp | 19 +- velox/exec/tests/utils/OperatorTestBase.cpp | 2 +- velox/exec/tests/utils/OperatorTestBase.h | 6 +- velox/exec/tests/utils/PlanBuilder.cpp | 18 +- velox/exec/tests/utils/PlanBuilder.h | 22 +- velox/exec/tests/utils/QueryAssertions.h | 6 +- .../exec/tests/utils/TableWriterTestBase.cpp | 2188 ++--- velox/exec/tests/utils/TableWriterTestBase.h | 745 +- .../tests/utils/TestIndexStorageConnector.cpp | 20 +- .../tests/utils/TestIndexStorageConnector.h | 42 +- 
velox/experimental/wave/dwio/ColumnReader.h | 2 +- velox/experimental/wave/dwio/FormatData.cpp | 2 +- velox/experimental/wave/dwio/FormatData.h | 2 +- velox/experimental/wave/exec/TableScan.cpp | 10 +- velox/experimental/wave/exec/TableScan.h | 20 +- velox/experimental/wave/exec/ToWave.cpp | 2 +- velox/experimental/wave/exec/ToWave.h | 10 +- velox/experimental/wave/exec/Wave.cpp | 4 +- velox/experimental/wave/exec/Wave.h | 8 +- velox/experimental/wave/exec/WaveDataSource.h | 8 +- velox/experimental/wave/exec/WaveDriver.cpp | 2 +- velox/experimental/wave/exec/WaveDriver.h | 2 +- .../wave/exec/WaveHiveDataSource.cpp | 20 +- .../wave/exec/WaveHiveDataSource.h | 14 +- velox/experimental/wave/exec/WaveOperator.h | 6 +- velox/experimental/wave/exec/WavePlan.cpp | 4 +- .../wave/exec/WaveSplitReader.cpp | 2 +- .../experimental/wave/exec/WaveSplitReader.h | 10 +- .../wave/exec/tests/WaveBenchmark.cpp | 6 +- .../wave/exec/tests/utils/FileFormat.h | 6 +- .../exec/tests/utils/TestFormatReader.cpp | 10 +- .../wave/exec/tests/utils/TestFormatReader.h | 4 +- .../exec/tests/utils/WaveTestSplitReader.cpp | 6 +- .../exec/tests/utils/WaveTestSplitReader.h | 4 +- velox/expression/Expr.cpp | 34 +- velox/expression/Expr.h | 4 +- velox/expression/ExprToSubfieldFilter.cpp | 54 +- velox/expression/ExprToSubfieldFilter.h | 50 +- velox/expression/LambdaExpr.cpp | 2 +- velox/expression/LambdaExpr.h | 2 +- .../fuzzer/ExpressionFuzzerVerifier.cpp | 2 +- velox/expression/tests/ArrayWriterTest.cpp | 2 +- .../tests/ExprToSubfieldFilterTest.cpp | 4 +- velox/expression/tests/ExpressionRunner.cpp | 2 +- .../expression/tests/ExpressionRunnerTest.cpp | 2 +- velox/expression/tests/ExpressionVerifier.cpp | 2 +- velox/expression/tests/GenericViewTest.cpp | 4 +- velox/expression/tests/TryExprTest.cpp | 2 +- .../lib/aggregates/DecimalAggregate.h | 4 +- .../noisy_aggregation/NoisyCountAccumulator.h | 4 +- .../tests/utils/AggregationTestBase.cpp | 10 +- .../lib/tests/QuantileDigestTest.cpp | 8 +- velox/functions/lib/tests/RepeatTest.cpp | 2 +- velox/functions/lib/tests/TDigestTest.cpp | 8 +- velox/functions/prestosql/BinaryFunctions.h | 2 +- .../prestosql/HyperLogLogFunctions.h | 12 +- velox/functions/prestosql/InPredicate.cpp | 20 +- .../aggregates/ApproxDistinctAggregate.cpp | 4 +- .../aggregates/ClassificationAggregation.cpp | 6 +- .../aggregates/HyperLogLogAggregate.h | 12 +- .../prestosql/aggregates/MergeAggregate.cpp | 2 +- .../aggregates/tests/ApproxDistinctTest.cpp | 58 +- .../tests/ChecksumAggregateTest.cpp | 2 +- .../tests/ClassificationAggregationTest.cpp | 2 +- .../tests/MapUnionAggregationTest.cpp | 2 +- .../aggregates/tests/MapUnionSumTest.cpp | 8 +- .../aggregates/tests/PrestoHasherTest.cpp | 4 +- .../fuzzer/ApproxDistinctResultVerifier.h | 2 +- .../prestosql/tests/ArrayCombinationsTest.cpp | 12 +- .../prestosql/tests/ArrayIntersectTest.cpp | 18 +- .../prestosql/tests/ComparisonsTest.cpp | 8 +- .../prestosql/tests/InPredicateTest.cpp | 2 +- .../prestosql/tests/JsonCastTest.cpp | 8 +- .../prestosql/tests/JsonFunctionsTest.cpp | 2 +- .../prestosql/tests/MapFilterTest.cpp | 2 +- velox/functions/prestosql/tests/ZipTest.cpp | 2 +- .../sparksql/tests/ArraySortTestData.h | 6 +- velox/functions/sparksql/tests/HashTest.cpp | 2 +- .../functions/sparksql/tests/XxHash64Test.cpp | 2 +- velox/python/init/PyInit.cpp | 2 +- velox/python/plan_builder/PyPlanBuilder.cpp | 8 +- velox/python/plan_builder/PyPlanBuilder.h | 2 +- velox/python/runner/PyConnectors.cpp | 10 +- velox/row/tests/CompactRowTest.cpp | 10 +- velox/runner/LocalRunner.h 
| 10 +- velox/runner/Runner.h | 4 +- .../PrestoBatchVectorSerializer.cpp | 2 +- .../serializers/PrestoBatchVectorSerializer.h | 2 +- .../PrestoIterativeVectorSerializer.cpp | 6 +- velox/serializers/PrestoSerializer.cpp | 8 +- velox/serializers/PrestoSerializer.h | 2 +- .../PrestoSerializerDeserializationUtils.cpp | 2 +- velox/serializers/RowSerializer.h | 6 +- .../tests/CompactRowSerializerTest.cpp | 8 +- .../tests/PrestoSerializerTest.cpp | 28 +- .../tests/UnsafeRowSerializerTest.cpp | 8 +- velox/substrait/SubstraitToVeloxPlan.cpp | 10 +- velox/substrait/SubstraitToVeloxPlan.h | 2 +- .../Substrait2VeloxPlanConversionTest.cpp | 4 +- velox/tool/trace/TableWriterReplayer.cpp | 6 +- velox/tool/trace/TraceReplayRunner.cpp | 12 +- .../trace/tests/AggregationReplayerTest.cpp | 4 +- .../trace/tests/FilterProjectReplayerTest.cpp | 4 +- .../tool/trace/tests/HashJoinReplayerTest.cpp | 4 +- .../tests/PartitionedOutputReplayerTest.cpp | 4 +- .../trace/tests/TableScanReplayerTest.cpp | 8 +- .../trace/tests/TableWriterReplayerTest.cpp | 25 +- velox/tool/trace/tests/TraceFileToolTest.cpp | 4 +- velox/type/tests/FilterSerDeTest.cpp | 2 +- .../type/tests/NegatedBytesRangeBenchmark.cpp | 4 +- .../tests/NegatedBytesValuesBenchmark.cpp | 4 +- velox/type/tests/SubfieldFiltersBuilder.h | 6 +- velox/type/tests/TimestampTest.cpp | 4 +- .../tests/TimeZoneMapExternalInvalidTest.cpp | 2 +- velox/vector/VectorStream.h | 6 +- velox/vector/tests/EncodedVectorCopyTest.cpp | 2 +- velox/vector/tests/VectorMakerTest.cpp | 6 +- velox/vector/tests/VectorSaverTest.cpp | 2 +- velox/vector/tests/VectorTest.cpp | 24 +- 402 files changed, 8878 insertions(+), 7218 deletions(-) create mode 100644 velox/connectors/common/CMakeLists.txt rename velox/connectors/{ => common}/Connector.cpp (93%) rename velox/connectors/{ => common}/Connector.h (93%) create mode 100644 velox/connectors/common/ConnectorNames.h create mode 100644 velox/connectors/common/ConnectorObjectFactory.cpp create mode 100644 velox/connectors/common/ConnectorObjectFactory.h create mode 100644 velox/connectors/common/PluginLoader.cpp create mode 100644 velox/connectors/common/PluginLoader.h create mode 100644 velox/connectors/hive/HiveObjectFactory.cpp create mode 100644 velox/connectors/hive/HiveObjectFactory.h create mode 100644 velox/connectors/hive/HivePlugin.cpp create mode 100644 velox/connectors/iceberg/IcebergObjectFactory.cpp create mode 100644 velox/connectors/iceberg/IcebergObjectFactory.h create mode 100644 velox/connectors/iceberg/IcebergTableHandle.cpp create mode 100644 velox/connectors/iceberg/IcebergTableHandle.h create mode 100644 velox/connectors/tpch/TpchConnectorObjectFactory.cpp create mode 100644 velox/connectors/tpch/TpchPlugin.cpp diff --git a/velox/benchmarks/QueryBenchmarkBase.cpp b/velox/benchmarks/QueryBenchmarkBase.cpp index 469b6c00542d..113b544008bf 100644 --- a/velox/benchmarks/QueryBenchmarkBase.cpp +++ b/velox/benchmarks/QueryBenchmarkBase.cpp @@ -186,23 +186,23 @@ void QueryBenchmarkBase::initialize() { std::move(configurationValues)); // Create hive connector with config... 
- connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); auto hiveConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector(kHiveConnectorId, properties, ioExecutor_.get()); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); parquet::registerParquetReaderFactory(); dwrf::registerDwrfReaderFactory(); } -std::vector> +std::vector> QueryBenchmarkBase::listSplits( const std::string& path, int32_t numSplitsPerFile, const exec::test::TpchPlan& plan) { - std::vector> result; + std::vector> result; auto temp = HiveConnectorTestBase::makeHiveConnectorSplits( path, numSplitsPerFile, plan.dataFileFormat); for (auto& i : temp) { diff --git a/velox/benchmarks/QueryBenchmarkBase.h b/velox/benchmarks/QueryBenchmarkBase.h index a5a172a8216c..6835c57a6c55 100644 --- a/velox/benchmarks/QueryBenchmarkBase.h +++ b/velox/benchmarks/QueryBenchmarkBase.h @@ -89,7 +89,7 @@ class QueryBenchmarkBase { std::pair, std::vector> run( const exec::test::TpchPlan& tpchPlan); - virtual std::vector> listSplits( + virtual std::vector> listSplits( const std::string& path, int32_t numSplitsPerFile, const exec::test::TpchPlan& plan); diff --git a/velox/buffer/tests/BufferTest.cpp b/velox/buffer/tests/BufferTest.cpp index 9db4963221da..8bc5cbab1741 100644 --- a/velox/buffer/tests/BufferTest.cpp +++ b/velox/buffer/tests/BufferTest.cpp @@ -296,7 +296,7 @@ DEBUG_ONLY_TEST_F(BufferTest, testReallocateFails) { ::memset(buffer->asMutable(), 'a', bufferSize); - common::testutil::TestValue::enable(); + velox::common::testutil::TestValue::enable(); const std::string kErrorMessage = "Expected out of memory exception"; SCOPED_TESTVALUE_SET( diff --git a/velox/common/base/AsyncSource.h b/velox/common/base/AsyncSource.h index 76740dd8681a..8dcbe6ee7008 100644 --- a/velox/common/base/AsyncSource.h +++ b/velox/common/base/AsyncSource.h @@ -63,7 +63,7 @@ class AsyncSource { // Makes an item if it is not already made. To be called on a background // executor. void prepare() { - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::AsyncSource::prepare", this); std::function()> make = nullptr; { @@ -101,7 +101,7 @@ class AsyncSource { // If the item is preparing on the executor, waits for the item and // otherwise makes it on the caller thread. 
std::unique_ptr move() { - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::AsyncSource::move", this); std::function()> make = nullptr; ContinueFuture wait; diff --git a/velox/common/base/BloomFilter.h b/velox/common/base/BloomFilter.h index 1d23e382834b..93e5af5928b5 100644 --- a/velox/common/base/BloomFilter.h +++ b/velox/common/base/BloomFilter.h @@ -62,7 +62,7 @@ class BloomFilter { } void merge(const char* serialized) { - common::InputByteStream stream(serialized); + velox::common::InputByteStream stream(serialized); auto version = stream.read(); VELOX_USER_CHECK_EQ(kBloomFilterV1, version); auto size = stream.read(); @@ -88,7 +88,7 @@ class BloomFilter { } void serialize(char* output) const { - common::OutputByteStream stream(output); + velox::common::OutputByteStream stream(output); stream.appendOne(kBloomFilterV1); stream.appendOne((int32_t)bits_.size()); for (auto bit : bits_) { diff --git a/velox/common/base/SpillConfig.cpp b/velox/common/base/SpillConfig.cpp index dd428a41ec7b..d419f639e423 100644 --- a/velox/common/base/SpillConfig.cpp +++ b/velox/common/base/SpillConfig.cpp @@ -52,7 +52,7 @@ SpillConfig::SpillConfig( maxSpillLevel(_maxSpillLevel), maxSpillRunRows(_maxSpillRunRows), writerFlushThresholdSize(_writerFlushThresholdSize), - compressionKind(common::stringToCompressionKind(_compressionKind)), + compressionKind(velox::common::stringToCompressionKind(_compressionKind)), prefixSortConfig(_prefixSortConfig), fileCreateConfig(_fileCreateConfig) { VELOX_USER_CHECK_GE( diff --git a/velox/common/base/SpillConfig.h b/velox/common/base/SpillConfig.h index 7f30bc6e614f..d01666872852 100644 --- a/velox/common/base/SpillConfig.h +++ b/velox/common/base/SpillConfig.h @@ -149,7 +149,7 @@ struct SpillConfig { uint64_t writerFlushThresholdSize; /// CompressionKind when spilling, CompressionKind_NONE means no compression. - common::CompressionKind compressionKind; + velox::common::CompressionKind compressionKind; /// Prefix sort config when spilling, enable prefix sort when this config is /// set, otherwise, fallback to timsort. 
diff --git a/velox/common/base/SpillStats.h b/velox/common/base/SpillStats.h index da9c5fe92184..a58ce2714688 100644 --- a/velox/common/base/SpillStats.h +++ b/velox/common/base/SpillStats.h @@ -113,7 +113,7 @@ struct SpillStats { FOLLY_ALWAYS_INLINE std::ostream& operator<<( std::ostream& o, - const common::SpillStats& stats) { + const velox::common::SpillStats& stats) { return o << stats.toString(); } diff --git a/velox/common/caching/SsdFile.cpp b/velox/common/caching/SsdFile.cpp index fa5c46f22dae..7e7dfc24270a 100644 --- a/velox/common/caching/SsdFile.cpp +++ b/velox/common/caching/SsdFile.cpp @@ -935,13 +935,13 @@ void SsdFile::disableFileCow() { namespace { template -T readNumber(common::FileInputStream* stream) { +T readNumber(velox::common::FileInputStream* stream) { T data; stream->readBytes(reinterpret_cast(&data), sizeof(T)); return data; } -std::string readString(common::FileInputStream* stream, int32_t length) { +std::string readString(velox::common::FileInputStream* stream, int32_t length) { std::string data(length, '\0'); stream->readBytes( reinterpret_cast(const_cast(data.data())), length); @@ -949,7 +949,7 @@ std::string readString(common::FileInputStream* stream, int32_t length) { } template -std::vector readVector(common::FileInputStream* stream, int32_t size) { +std::vector readVector(velox::common::FileInputStream* stream, int32_t size) { std::vector dataVector(size); stream->readBytes( reinterpret_cast(dataVector.data()), size * sizeof(T)); diff --git a/velox/common/compression/tests/CompressionTest.cpp b/velox/common/compression/tests/CompressionTest.cpp index 846d606e4499..c30ff72d2e52 100644 --- a/velox/common/compression/tests/CompressionTest.cpp +++ b/velox/common/compression/tests/CompressionTest.cpp @@ -44,7 +44,7 @@ struct TestParams { std::shared_ptr codecOptions; explicit TestParams( - common::CompressionKind compressionKind, + velox::common::CompressionKind compressionKind, std::shared_ptr codecOptions = kDefaultCodecOptions) : compressionKind(compressionKind), codecOptions(std::move(codecOptions)) {} diff --git a/velox/common/file/File.cpp b/velox/common/file/File.cpp index c1f1e8982737..6295e2489473 100644 --- a/velox/common/file/File.cpp +++ b/velox/common/file/File.cpp @@ -90,7 +90,7 @@ uint64_t ReadFile::preadv( } uint64_t ReadFile::preadv( - folly::Range regions, + folly::Range regions, folly::Range iobufs, filesystems::File::IoStats* stats) const { VELOX_CHECK_EQ(regions.size(), iobufs.size()); @@ -309,7 +309,7 @@ LocalWriteFile::LocalWriteFile( const auto dir = fs::path(path_).parent_path(); if (shouldCreateParentDirectories && !fs::exists(dir)) { VELOX_CHECK( - common::generateFileDirectory(dir.c_str()), + velox::common::generateFileDirectory(dir.c_str()), "Failed to generate file directory"); } diff --git a/velox/common/file/File.h b/velox/common/file/File.h index 18d1c264ca7a..67643bee9077 100644 --- a/velox/common/file/File.h +++ b/velox/common/file/File.h @@ -99,7 +99,7 @@ class ReadFile { // // This method should be thread safe. 
virtual uint64_t preadv( - folly::Range regions, + folly::Range regions, folly::Range iobufs, filesystems::File::IoStats* stats = nullptr) const; diff --git a/velox/common/hyperloglog/DenseHll.cpp b/velox/common/hyperloglog/DenseHll.cpp index f3a3f54b2d2d..72ba45bd09ec 100644 --- a/velox/common/hyperloglog/DenseHll.cpp +++ b/velox/common/hyperloglog/DenseHll.cpp @@ -232,7 +232,7 @@ int64_t cardinalityImpl(const DenseHllView& hll) { } DenseHllView deserialize(const char* serialized) { - common::InputByteStream stream(serialized); + velox::common::InputByteStream stream(serialized); auto version = stream.read(); VELOX_CHECK_EQ(kPrestoDenseV2, version); @@ -407,7 +407,7 @@ bool DenseHll::canDeserialize(const char* input, int size) { return false; } - common::InputByteStream stream(input); + velox::common::InputByteStream stream(input); auto version = stream.read(); if (kPrestoDenseV2 != version) { return false; @@ -461,7 +461,7 @@ bool DenseHll::canDeserialize(const char* input, int size) { // static int8_t DenseHll::deserializeIndexBitLength(const char* input) { - common::InputByteStream stream(input); + velox::common::InputByteStream stream(input); stream.read(); return stream.read(); } @@ -478,7 +478,7 @@ void DenseHll::serialize(char* output) { // sort overflow arrays to get consistent serialization for equivalent HLLs sortOverflows(); - common::OutputByteStream stream(output); + velox::common::OutputByteStream stream(output); stream.appendOne(kPrestoDenseV2); stream.appendOne(indexBitLength_); stream.appendOne(baseline_); @@ -540,7 +540,7 @@ void DenseHll::mergeWith(const DenseHll& other) { } void DenseHll::mergeWith(const char* serialized) { - common::InputByteStream stream(serialized); + velox::common::InputByteStream stream(serialized); auto version = stream.read(); VELOX_CHECK_EQ(kPrestoDenseV2, version); diff --git a/velox/common/hyperloglog/SparseHll.cpp b/velox/common/hyperloglog/SparseHll.cpp index 27d290dd10ef..ce2c4721c1c9 100644 --- a/velox/common/hyperloglog/SparseHll.cpp +++ b/velox/common/hyperloglog/SparseHll.cpp @@ -58,7 +58,7 @@ int searchIndex( } common::InputByteStream initializeInputStream(const char* serialized) { - common::InputByteStream stream(serialized); + velox::common::InputByteStream stream(serialized); auto version = stream.read(); VELOX_CHECK_EQ(kPrestoSparseV2, version); @@ -111,7 +111,7 @@ int64_t SparseHll::cardinality(const char* serialized) { } void SparseHll::serialize(int8_t indexBitLength, char* output) const { - common::OutputByteStream stream(output); + velox::common::OutputByteStream stream(output); stream.appendOne(kPrestoSparseV2); stream.appendOne(indexBitLength); stream.appendOne(static_cast(entries_.size())); @@ -127,7 +127,7 @@ std::string SparseHll::serializeEmpty(int8_t indexBitLength) { std::string serialized; serialized.resize(kSize); - common::OutputByteStream stream(serialized.data()); + velox::common::OutputByteStream stream(serialized.data()); stream.appendOne(kPrestoSparseV2); stream.appendOne(indexBitLength); stream.appendOne(static_cast(0)); diff --git a/velox/common/hyperloglog/benchmarks/DenseHll.cpp b/velox/common/hyperloglog/benchmarks/DenseHll.cpp index 7233280f1d66..192184c8b03c 100644 --- a/velox/common/hyperloglog/benchmarks/DenseHll.cpp +++ b/velox/common/hyperloglog/benchmarks/DenseHll.cpp @@ -49,7 +49,7 @@ class DenseHllBenchmark { folly::BenchmarkSuspender suspender; HashStringAllocator allocator(pool_); - common::hll::DenseHll hll(hashBits, &allocator); + velox::common::hll::DenseHll hll(hashBits, &allocator); 
suspender.dismiss(); @@ -61,7 +61,7 @@ class DenseHllBenchmark { private: std::string makeSerializedHll(int hashBits, int32_t step) { HashStringAllocator allocator(pool_); - common::hll::DenseHll hll(hashBits, &allocator); + velox::common::hll::DenseHll hll(hashBits, &allocator); for (int32_t i = 0; i < 1'000'000; ++i) { auto hash = hashOne(i * step); hll.insertHash(hash); @@ -69,7 +69,7 @@ class DenseHllBenchmark { return serialize(hll); } - static std::string serialize(common::hll::DenseHll& denseHll) { + static std::string serialize(velox::common::hll::DenseHll& denseHll) { auto size = denseHll.serializedSize(); std::string serialized; serialized.resize(size); diff --git a/velox/common/time/Timer.cpp b/velox/common/time/Timer.cpp index 78416f03e67e..ccfba63d2d81 100644 --- a/velox/common/time/Timer.cpp +++ b/velox/common/time/Timer.cpp @@ -21,7 +21,7 @@ namespace facebook::velox { using namespace std::chrono; -using common::testutil::ScopedTestTime; +using velox::common::testutil::ScopedTestTime; #ifndef NDEBUG diff --git a/velox/connectors/BeforeConnectorRefactor.puml b/velox/connectors/BeforeConnectorRefactor.puml index e89babb1a73a..f854d85cf1f7 100644 --- a/velox/connectors/BeforeConnectorRefactor.puml +++ b/velox/connectors/BeforeConnectorRefactor.puml @@ -148,4 +148,4 @@ package presto { } Connector <|- SystemConnector -@enduml +@enduml \ No newline at end of file diff --git a/velox/connectors/CMakeLists.txt b/velox/connectors/CMakeLists.txt index 3cc600201f6b..8b1c5318b65c 100644 --- a/velox/connectors/CMakeLists.txt +++ b/velox/connectors/CMakeLists.txt @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -velox_add_library(velox_connector Connector.cpp) +velox_add_library(velox_connector common/Connector.cpp) velox_link_libraries(velox_connector velox_common_config velox_vector) diff --git a/velox/connectors/after_connector_plugin.puml b/velox/connectors/after_connector_plugin.puml index 908a961e160b..d7da60116289 100644 --- a/velox/connectors/after_connector_plugin.puml +++ b/velox/connectors/after_connector_plugin.puml @@ -240,4 +240,4 @@ Connector <|.. SystemConnector PrestoServer --> ConnectorRegistryFunctions : getConnectorFactory(connectorName) PrestoServer --> ConnectorRegistryFunctions : registerConnector -@enduml +@enduml \ No newline at end of file diff --git a/velox/connectors/before_connector_registery.puml b/velox/connectors/before_connector_registery.puml index 3f3c5769d89f..6af5ea1e13ed 100644 --- a/velox/connectors/before_connector_registery.puml +++ b/velox/connectors/before_connector_registery.puml @@ -183,4 +183,4 @@ Connector <|.. 
SystemConnector PrestoServer --> ConnectorRegistryFunctions : getConnectorFactory(connectorName) PrestoServer --> ConnectorRegistryFunctions : registerConnector -@enduml +@enduml \ No newline at end of file diff --git a/velox/connectors/common/CMakeLists.txt b/velox/connectors/common/CMakeLists.txt new file mode 100644 index 000000000000..399befa5982c --- /dev/null +++ b/velox/connectors/common/CMakeLists.txt @@ -0,0 +1,44 @@ +# Common connector plugin support library +add_library(velox_connector_common SHARED + Connector.cpp + ConnectorNames.h + ConnectorObjectFactory.h + ConnectorObjectFactory.cpp + PluginLoader.cpp +) + +target_include_directories(velox_connector_common + PUBLIC + $ # “include from source” when building + $ # “include from install” when consuming +) + +# velox_connector_common itself needs velox_common and velox_dynamic_library_loader when it’s built, but anything that +# links against velox_connector_common does not automatically inherit those dependencies. +target_link_libraries(velox_connector_common + PRIVATE +# velox_common # core Velox symbols + velox_dynamic_library_loader +) + +# Set the shared‐library version and SOVERSION as needed +set_target_properties(velox_connector_common PROPERTIES + VERSION ${VELOX_VERSION} + SOVERSION ${VELOX_SOVERSION} +) + +# Install the shared library and its headers +install(TARGETS velox_connector_common + ARCHIVE DESTINATION lib #Installs the static library (libvelox_connector_common.a) into the /lib directory. + LIBRARY DESTINATION lib #Installs any shared library (libvelox_connector_common.dylib or .so) into the same /lib directory. + INCLUDES DESTINATION include/velox/connectors/common +) + +install( + FILES + Connector.h + ConnectorNames.h + PluginLoader.h + ConnectorObjectFactory.h + DESTINATION include/velox/connectors/common +) diff --git a/velox/connectors/Connector.cpp b/velox/connectors/common/Connector.cpp similarity index 93% rename from velox/connectors/Connector.cpp rename to velox/connectors/common/Connector.cpp index 6faa280a2e8c..8e0dcab00180 100644 --- a/velox/connectors/Connector.cpp +++ b/velox/connectors/common/Connector.cpp @@ -14,9 +14,12 @@ * limitations under the License. 
*/ -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" + +#include "velox/connectors/common/ConnectorObjectFactory.h" + +namespace facebook::velox::connector::common { -namespace facebook::velox::connector { namespace { std::unordered_map>& connectorFactories() { @@ -25,6 +28,14 @@ connectorFactories() { return factories; } +std::unordered_map>& +connectorObjectFactories() { + static std:: + unordered_map> + factories; + return factories; +} + std::unordered_map>& connectors() { static std::unordered_map> connectors; return connectors; diff --git a/velox/connectors/Connector.h b/velox/connectors/common/Connector.h similarity index 93% rename from velox/connectors/Connector.h rename to velox/connectors/common/Connector.h index 53ea387c468c..23b2a5b70a09 100644 --- a/velox/connectors/Connector.h +++ b/velox/connectors/common/Connector.h @@ -51,7 +51,7 @@ namespace facebook::velox::core { struct IndexLookupCondition; } -namespace facebook::velox::connector { +namespace facebook::velox::connector::common { class DataSource; @@ -155,11 +155,30 @@ class ConnectorInsertTableHandle : public ISerializable { virtual std::string toString() const = 0; - folly::dynamic serialize() const override { - VELOX_NYI(); - } + folly::dynamic serialize() const override; }; +/// An opaque handle describing where a connector should write its output. +class LocationHandle : public ISerializable { + public: + enum class TableType { kNew, kExisting }; + + virtual ~LocationHandle() = default; + + /// The target directory or base URI for the write. + virtual const std::string& targetDirectory() const = 0; + + /// Optionally a subdirectory for intermediate or final files. + virtual const std::string& writeDirectory() const = 0; + + /// Is this a brand‐new table or an existing one to append to? + virtual TableType tableType() const = 0; + + virtual folly::dynamic serialize() const override; +}; + +using LocationHandlePtr = std::shared_ptr; + /// Represents the commit strategy for writing to connector. enum class CommitStrategy { /// No more commit actions are needed. @@ -194,7 +213,7 @@ class DataSink { uint64_t recodeTimeNs{0}; uint64_t compressionTimeNs{0}; - common::SpillStats spillStats; + velox::common::SpillStats spillStats; bool empty() const; @@ -253,7 +272,7 @@ class DataSource { /// applies to. virtual void addDynamicFilter( column_index_t outputChannel, - const std::shared_ptr& filter) = 0; + const std::shared_ptr& filter) = 0; /// Returns the number of input bytes processed so far. 
virtual uint64_t getCompletedBytes() = 0; @@ -389,8 +408,8 @@ class ConnectorQueryCtx { memory::MemoryPool* operatorPool, memory::MemoryPool* connectorPool, const config::ConfigBase* sessionProperties, - const common::SpillConfig* spillConfig, - common::PrefixSortConfig prefixSortConfig, + const velox::common::SpillConfig* spillConfig, + velox::common::PrefixSortConfig prefixSortConfig, std::unique_ptr expressionEvaluator, cache::AsyncDataCache* cache, const std::string& queryId, @@ -435,11 +454,11 @@ class ConnectorQueryCtx { return sessionProperties_; } - const common::SpillConfig* spillConfig() const { + const velox::common::SpillConfig* spillConfig() const { return spillConfig_; } - const common::PrefixSortConfig& prefixSortConfig() const { + const velox::common::PrefixSortConfig& prefixSortConfig() const { return prefixSortConfig_; } @@ -506,8 +525,8 @@ class ConnectorQueryCtx { memory::MemoryPool* const operatorPool_; memory::MemoryPool* const connectorPool_; const config::ConfigBase* const sessionProperties_; - const common::SpillConfig* const spillConfig_; - const common::PrefixSortConfig prefixSortConfig_; + const velox::common::SpillConfig* const spillConfig_; + const velox::common::PrefixSortConfig prefixSortConfig_; const std::unique_ptr expressionEvaluator_; cache::AsyncDataCache* cache_; const std::string scanId_; @@ -555,7 +574,7 @@ class Connector { const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, ConnectorQueryCtx* connectorQueryCtx) = 0; /// Returns true if addSplit of DataSource can use 'dataSource' from @@ -616,7 +635,7 @@ class Connector { const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, ConnectorQueryCtx* connectorQueryCtx) { VELOX_UNSUPPORTED( "Connector {} does not support index source", connectorId()); @@ -708,4 +727,15 @@ std::shared_ptr getConnector(const std::string& connectorId); const std::unordered_map>& getAllConnectors(); -} // namespace facebook::velox::connector +} // namespace facebook::velox::connector::common + +template <> +struct fmt::formatter< + facebook::velox::connector::common::LocationHandle::TableType> + : formatter { + auto format( + facebook::velox::connector::common::LocationHandle::TableType s, + format_context& ctx) const { + return formatter::format(static_cast(s), ctx); + } +}; diff --git a/velox/connectors/common/ConnectorNames.h b/velox/connectors/common/ConnectorNames.h new file mode 100644 index 000000000000..d36d3f199a4a --- /dev/null +++ b/velox/connectors/common/ConnectorNames.h @@ -0,0 +1,32 @@ +/* +* Copyright (c) Facebook, Inc. and its affiliates. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/
+#pragma once
+
+namespace facebook::velox::connector::common {
+
+// TODO: Add a demo connector plugin
+
+inline constexpr const char* kFuzzerConnectorName = "fuzzer";
+inline constexpr const char* kHiveConnectorName = "hive";
+inline constexpr const char* kHiveV2ConnectorName = "hive_v2";
+inline constexpr const char* kTpchConnectorName = "tpch";
+
+// Define the Hive ColumnType as strings to avoid direct Hive reference
+inline constexpr const char* kColumnTypeRegular = "regular";
+inline constexpr const char* kColumnTypePartition = "partition_key";
+inline constexpr const char* kColumnTypeSynthesized = "synthesized";
+
+} // namespace facebook::velox::connector::common
diff --git a/velox/connectors/common/ConnectorObjectFactory.cpp b/velox/connectors/common/ConnectorObjectFactory.cpp
new file mode 100644
index 000000000000..2ac35c48d3e4
--- /dev/null
+++ b/velox/connectors/common/ConnectorObjectFactory.cpp
@@ -0,0 +1,21 @@
+/*
+* Copyright (c) Facebook, Inc. and its affiliates.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+ */
+#include "velox/connectors/common/ConnectorObjectFactory.h"
+
+
+namespace facebook::velox::connector::common {
+
+} // namespace facebook::velox::connector::common
diff --git a/velox/connectors/common/ConnectorObjectFactory.h b/velox/connectors/common/ConnectorObjectFactory.h
new file mode 100644
index 000000000000..fbf6a020017b
--- /dev/null
+++ b/velox/connectors/common/ConnectorObjectFactory.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include 
+#include 
+
+#include "velox/connectors/common/Connector.h"
+
+namespace facebook::velox::connector::common {
+
+class ConnectorObjectFactory {
+ public:
+  virtual ~ConnectorObjectFactory() = default;
+
+  virtual std::shared_ptr makeConnectorSplit(
+      const std::string& connectorId,
+      const std::string& filePath,
+      uint64_t start,
+      uint64_t length,
+      const folly::dynamic& options = {}) const {
+    VELOX_FAIL("ConnectorSplit not supported by connector");
+  }
+
+  virtual std::unique_ptr makeColumnHandle(
+      const std::string& connectorId,
+      const std::string& name,
+      const TypePtr& type,
+      const folly::dynamic& options) const {
+    VELOX_FAIL("ColumnHandle not supported by connector");
+  }
+
+  virtual std::shared_ptr makeTableHandle(
+      const std::string& connectorId,
+      const std::string& tableName,
+      const RowTypePtr& dataColumns = nullptr,
+      const folly::dynamic& options = {}) const {
+    VELOX_FAIL("TableHandle not supported by connector");
+  }
+
+  virtual std::shared_ptr makeInsertTableHandle(
+      const std::string& connectorId,
+      const std::vector& tableColumnNames,
+      const std::vector& tableColumnTypes,
+      std::shared_ptr locationHandle,
+      const std::optional compressionKind,
+      const folly::dynamic& options = {}) const {
+    VELOX_FAIL("InsertTableHandle not supported by connector");
+  }
+
+  virtual std::shared_ptr makeLocationHandle(
+      const std::string& connectorId,
+      std::string targetDirectory,
+//      const RowTypePtr& dataColumns = nullptr,
+      std::optional writeDirectory = std::nullopt,
+      LocationHandle::TableType tableType =
+          LocationHandle::TableType::kNew) const {
+    VELOX_FAIL("LocationHandle not supported by connector");
+  }
+};
+//
+///// Registry for ConnectorObjectFactory implementations.
+// class ConnectorObjectFactoryRegistry {
+//  public:
+//   static ConnectorObjectFactoryRegistry& instance() {
+//     static ConnectorObjectFactoryRegistry registryInstance;
+//     return registryInstance;
+//   }
+//
+//   void registerFactory(
+//       const std::string& connectorName,
+//       std::unique_ptr factory) {
+//     VELOX_CHECK(
+//         factories_.emplace(connectorName, std::move(factory)).second,
+//         "Factory for connector '{}' already registered",
+//         connectorName);
+//   }
+//
+//   const ConnectorObjectFactory& factoryFor(
+//       const std::string& connectorName) const {
+//     auto it = factories_.find(connectorName);
+//     VELOX_CHECK(
+//         it != factories_.end(),
+//         "No factory registered for connector '{}'",
+//         connectorName);
+//     return *it->second;
+//   }
+//
+//  private:
+//   std::unordered_map>
+//       factories_;
+// };
+
+} // namespace facebook::velox::connector::common
diff --git a/velox/connectors/common/PluginLoader.cpp b/velox/connectors/common/PluginLoader.cpp
new file mode 100644
index 000000000000..4dac547140ae
--- /dev/null
+++ b/velox/connectors/common/PluginLoader.cpp
@@ -0,0 +1,33 @@
+/*
+* Copyright (c) Facebook, Inc. and its affiliates.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License. 
+*/ +#pragma once + +#include "velox/connectors/common/PluginLoader.h" + +#include +#include + +#include "velox/common/dynamic_registry/DynamicLibraryLoader.h" + +namespace facebook::velox::connector::common { + +void loadConnectorPlugins(const std::vector& pluginPaths) { + for (auto& path : pluginPaths) { + facebook::velox::loadDynamicLibrary(path); + } +} + +} \ No newline at end of file diff --git a/velox/connectors/common/PluginLoader.h b/velox/connectors/common/PluginLoader.h new file mode 100644 index 000000000000..14d25d302389 --- /dev/null +++ b/velox/connectors/common/PluginLoader.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +//#include "velox/common/dynamic_registry/DynamicLibraryLoader.h" + +namespace facebook::velox::connector::common { + + void loadConnectorPlugins(const std::vector& pluginPaths); + +} // namespace facebook::velox::connector::common diff --git a/velox/connectors/fuzzer/FuzzerConnector.cpp b/velox/connectors/fuzzer/FuzzerConnector.cpp index 53f0c1843664..c74f1a854e40 100644 --- a/velox/connectors/fuzzer/FuzzerConnector.cpp +++ b/velox/connectors/fuzzer/FuzzerConnector.cpp @@ -21,7 +21,7 @@ namespace facebook::velox::connector::fuzzer { FuzzerDataSource::FuzzerDataSource( const std::shared_ptr& outputType, - const std::shared_ptr& tableHandle, + const std::shared_ptr& tableHandle, velox::memory::MemoryPool* pool) : outputType_(outputType), pool_(pool) { auto fuzzerTableHandle = @@ -34,7 +34,7 @@ FuzzerDataSource::FuzzerDataSource( fuzzerTableHandle->fuzzerOptions, pool_, fuzzerTableHandle->fuzzerSeed); } -void FuzzerDataSource::addSplit(std::shared_ptr split) { +void FuzzerDataSource::addSplit(std::shared_ptr split) { VELOX_CHECK_EQ( currentSplit_, nullptr, diff --git a/velox/connectors/fuzzer/FuzzerConnector.h b/velox/connectors/fuzzer/FuzzerConnector.h index 33cc1f819fc6..3b354daf499a 100644 --- a/velox/connectors/fuzzer/FuzzerConnector.h +++ b/velox/connectors/fuzzer/FuzzerConnector.h @@ -16,7 +16,7 @@ #pragma once #include "velox/common/config/Config.h" -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" #include "velox/connectors/fuzzer/FuzzerConnectorSplit.h" #include "velox/vector/fuzzer/VectorFuzzer.h" @@ -32,13 +32,13 @@ namespace facebook::velox::connector::fuzzer { /// FuzzerConnectorSplit lets clients specify how many rows are expected to be /// generated. 
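// A minimal usage sketch of the new plugin-loading flow, assuming the hive
// connector plugin target introduced by this patch is installed under
// "plugins/" and that its load hook registers a ConnectorFactory under
// kHiveConnectorName; the connector id below is a placeholder.
#include "velox/connectors/common/Connector.h"
#include "velox/connectors/common/ConnectorNames.h"
#include "velox/connectors/common/PluginLoader.h"

namespace example {

void loadAndRegisterHiveConnector() {
  using namespace facebook::velox::connector::common;

  // Load the connector shared library; the dynamic-registry hook inside the
  // plugin is expected to register its factory on load.
  loadConnectorPlugins({"plugins/libvelox_hive_connector_plugin.so"});

  // Create a connector instance through the namespaced factory registry and
  // register it so query plans can reference it by id.
  auto connector = getConnectorFactory(kHiveConnectorName)
                       ->newConnector("hive-example", /*config=*/nullptr);
  registerConnector(connector);
}

} // namespace example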
-class FuzzerTableHandle : public ConnectorTableHandle { +class FuzzerTableHandle : public connector::common::ConnectorTableHandle { public: explicit FuzzerTableHandle( std::string connectorId, VectorFuzzer::Options options, size_t fuzzerSeed = 0) - : ConnectorTableHandle(std::move(connectorId)), + : connector::common::ConnectorTableHandle(std::move(connectorId)), fuzzerOptions(options), fuzzerSeed(fuzzerSeed) {} @@ -52,18 +52,18 @@ class FuzzerTableHandle : public ConnectorTableHandle { size_t fuzzerSeed; }; -class FuzzerDataSource : public DataSource { +class FuzzerDataSource : public connector::common::DataSource { public: FuzzerDataSource( const std::shared_ptr& outputType, - const std::shared_ptr& tableHandle, + const std::shared_ptr& tableHandle, velox::memory::MemoryPool* pool); - void addSplit(std::shared_ptr split) override; + void addSplit(std::shared_ptr split) override; void addDynamicFilter( column_index_t /*outputChannel*/, - const std::shared_ptr& /*filter*/) override { + const std::shared_ptr& /*filter*/) override { VELOX_NYI("Dynamic filters not supported by FuzzerConnector."); } @@ -100,45 +100,45 @@ class FuzzerDataSource : public DataSource { memory::MemoryPool* pool_; }; -class FuzzerConnector final : public Connector { +class FuzzerConnector final : public connector::common::Connector { public: FuzzerConnector( const std::string& id, std::shared_ptr config, folly::Executor* /*executor*/) - : Connector(id) {} + : connector::common::Connector(id) {} - std::unique_ptr createDataSource( + std::unique_ptr createDataSource( const std::shared_ptr& outputType, - const std::shared_ptr& tableHandle, + const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& /*columnHandles*/, - ConnectorQueryCtx* connectorQueryCtx) override final { + std::shared_ptr>& /*columnHandles*/, + connector::common::ConnectorQueryCtx* connectorQueryCtx) override final { return std::make_unique( outputType, tableHandle, connectorQueryCtx->memoryPool()); } - std::unique_ptr createDataSink( + std::unique_ptr createDataSink( RowTypePtr /*inputType*/, std::shared_ptr< - ConnectorInsertTableHandle> /*connectorInsertTableHandle*/, - ConnectorQueryCtx* /*connectorQueryCtx*/, - CommitStrategy /*commitStrategy*/) override final { + connector::common::ConnectorInsertTableHandle> /*connectorInsertTableHandle*/, + connector::common::ConnectorQueryCtx* /*connectorQueryCtx*/, + connector::common::CommitStrategy /*commitStrategy*/) override final { VELOX_NYI("FuzzerConnector does not support data sink."); } }; -class FuzzerConnectorFactory : public ConnectorFactory { +class FuzzerConnectorFactory : public connector::common::ConnectorFactory { public: static constexpr const char* kFuzzerConnectorName{"fuzzer"}; - FuzzerConnectorFactory() : ConnectorFactory(kFuzzerConnectorName) {} + FuzzerConnectorFactory() : connector::common::ConnectorFactory(kFuzzerConnectorName) {} explicit FuzzerConnectorFactory(const char* connectorName) - : ConnectorFactory(connectorName) {} + : connector::common::ConnectorFactory(connectorName) {} - std::shared_ptr newConnector( + std::shared_ptr newConnector( const std::string& id, std::shared_ptr config, folly::Executor* ioExecutor = nullptr, diff --git a/velox/connectors/fuzzer/FuzzerConnectorSplit.h b/velox/connectors/fuzzer/FuzzerConnectorSplit.h index 11080aa4382e..f9ff36c0a59c 100644 --- a/velox/connectors/fuzzer/FuzzerConnectorSplit.h +++ b/velox/connectors/fuzzer/FuzzerConnectorSplit.h @@ -15,13 +15,13 @@ */ #pragma once -#include 
"velox/connectors/Connector.h" +#include "../common/Connector.h" namespace facebook::velox::connector::fuzzer { -struct FuzzerConnectorSplit : public connector::ConnectorSplit { +struct FuzzerConnectorSplit : public connector::common::ConnectorSplit { explicit FuzzerConnectorSplit(const std::string& connectorId, size_t numRows) - : ConnectorSplit(connectorId), numRows(numRows) {} + : connector::common::ConnectorSplit(connectorId), numRows(numRows) {} // Row many rows to generate. size_t numRows; diff --git a/velox/connectors/fuzzer/tests/FuzzerConnectorTestBase.h b/velox/connectors/fuzzer/tests/FuzzerConnectorTestBase.h index 971b54dbe9b8..1be47edd1a75 100644 --- a/velox/connectors/fuzzer/tests/FuzzerConnectorTestBase.h +++ b/velox/connectors/fuzzer/tests/FuzzerConnectorTestBase.h @@ -26,19 +26,19 @@ class FuzzerConnectorTestBase : public exec::test::OperatorTestBase { void SetUp() override { OperatorTestBase::SetUp(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); std::shared_ptr config; auto fuzzerConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::fuzzer::FuzzerConnectorFactory::kFuzzerConnectorName) ->newConnector(kFuzzerConnectorId, config); - connector::registerConnector(fuzzerConnector); + connector::common::registerConnector(fuzzerConnector); } void TearDown() override { - connector::unregisterConnector(kFuzzerConnectorId); - connector::unregisterConnectorFactory( + connector::common::unregisterConnector(kFuzzerConnectorId); + connector::common::unregisterConnectorFactory( connector::fuzzer::FuzzerConnectorFactory::kFuzzerConnectorName); OperatorTestBase::TearDown(); } diff --git a/velox/connectors/hive/CMakeLists.txt b/velox/connectors/hive/CMakeLists.txt index 6c8aa05487b4..5a3e1d359888 100644 --- a/velox/connectors/hive/CMakeLists.txt +++ b/velox/connectors/hive/CMakeLists.txt @@ -27,6 +27,7 @@ velox_add_library( HiveConnectorSplit.cpp HiveDataSink.cpp HiveDataSource.cpp + HiveObjectFactory.cpp HivePartitionUtil.cpp PartitionIdGenerator.cpp SplitReader.cpp @@ -51,3 +52,40 @@ endif() if(${VELOX_ENABLE_BENCHMARKS}) add_subdirectory(benchmarks) endif() + +add_library(velox_hive_connector_static STATIC + $ + $ + $ +) +target_link_libraries(velox_hive_connector_static + PUBLIC + velox_core + velox_exception + velox_common_io + velox_connector + velox_dwio_catalog_fbhive + velox_exec + velox_hive_iceberg_splitreader +) +install(TARGETS velox_hive_connector_static + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib +) + +# --- Plugin definition: upstream Hive as a .so plugin --- +add_library(velox_hive_connector_plugin SHARED + HiveObjectFactory.h + HivePlugin.cpp +) +target_link_libraries(velox_hive_connector_plugin + PRIVATE + velox_connector_common + velox_connectors_hive +) +set_target_properties(velox_hive_connector_plugin PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/plugins +) +install(TARGETS velox_hive_connector_plugin + LIBRARY DESTINATION plugins +) diff --git a/velox/connectors/hive/HiveConnector.cpp b/velox/connectors/hive/HiveConnector.cpp index 176c1ac42a72..1fb1aab3b410 100644 --- a/velox/connectors/hive/HiveConnector.cpp +++ b/velox/connectors/hive/HiveConnector.cpp @@ -43,7 +43,7 @@ HiveConnector::HiveConnector( const std::string& id, std::shared_ptr config, folly::Executor* executor) - : Connector(id), + : connector::common::Connector(id), hiveConfig_(std::make_shared(config)), fileHandleFactory_( hiveConfig_->isFileHandleCacheEnabled() @@ -70,13 
+70,13 @@ HiveConnector::HiveConnector( } } -std::unique_ptr HiveConnector::createDataSource( +std::unique_ptr HiveConnector::createDataSource( const RowTypePtr& outputType, - const std::shared_ptr& tableHandle, + const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, - ConnectorQueryCtx* connectorQueryCtx) { + std::shared_ptr>& columnHandles, + connector::common::ConnectorQueryCtx* connectorQueryCtx) { return std::make_unique( outputType, tableHandle, @@ -87,11 +87,11 @@ std::unique_ptr HiveConnector::createDataSource( hiveConfig_); } -std::unique_ptr HiveConnector::createDataSink( +std::unique_ptr HiveConnector::createDataSink( RowTypePtr inputType, - std::shared_ptr connectorInsertTableHandle, - ConnectorQueryCtx* connectorQueryCtx, - CommitStrategy commitStrategy) { + std::shared_ptr connectorInsertTableHandle, + connector::common::ConnectorQueryCtx* connectorQueryCtx, + connector::common::CommitStrategy commitStrategy) { auto hiveInsertHandle = std::dynamic_pointer_cast( connectorInsertTableHandle); VELOX_CHECK_NOT_NULL( diff --git a/velox/connectors/hive/HiveConnector.h b/velox/connectors/hive/HiveConnector.h index 546ecd5d2733..010f2fdebdc3 100644 --- a/velox/connectors/hive/HiveConnector.h +++ b/velox/connectors/hive/HiveConnector.h @@ -15,7 +15,7 @@ */ #pragma once -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" #include "velox/connectors/hive/FileHandle.h" #include "velox/connectors/hive/HiveConfig.h" #include "velox/core/PlanNode.h" @@ -27,7 +27,7 @@ class DataSource; namespace facebook::velox::connector::hive { -class HiveConnector : public Connector { +class HiveConnector : public connector::common::Connector { public: HiveConnector( const std::string& id, @@ -43,28 +43,28 @@ class HiveConnector : public Connector { return true; } - ConnectorMetadata* metadata() const override { + connector::common::ConnectorMetadata* metadata() const override { VELOX_CHECK_NOT_NULL(metadata_); return metadata_.get(); } - std::unique_ptr createDataSource( + std::unique_ptr createDataSource( const RowTypePtr& outputType, - const std::shared_ptr& tableHandle, + const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, - ConnectorQueryCtx* connectorQueryCtx) override; + std::shared_ptr>& columnHandles, + connector::common::ConnectorQueryCtx* connectorQueryCtx) override; bool supportsSplitPreload() override { return true; } - std::unique_ptr createDataSink( + std::unique_ptr createDataSink( RowTypePtr inputType, - std::shared_ptr connectorInsertTableHandle, - ConnectorQueryCtx* connectorQueryCtx, - CommitStrategy commitStrategy) override; + std::shared_ptr connectorInsertTableHandle, + connector::common::ConnectorQueryCtx* connectorQueryCtx, + connector::common::CommitStrategy commitStrategy) override; folly::Executor* executor() const override { return executor_; @@ -84,19 +84,19 @@ class HiveConnector : public Connector { const std::shared_ptr hiveConfig_; FileHandleFactory fileHandleFactory_; folly::Executor* executor_; - std::shared_ptr metadata_; + std::shared_ptr metadata_; }; -class HiveConnectorFactory : public ConnectorFactory { +class HiveConnectorFactory : public connector::common::ConnectorFactory { public: static constexpr const char* kHiveConnectorName = "hive"; - HiveConnectorFactory() : ConnectorFactory(kHiveConnectorName) {} + HiveConnectorFactory() : connector::common::ConnectorFactory(kHiveConnectorName) {} explicit 
HiveConnectorFactory(const char* connectorName) - : ConnectorFactory(connectorName) {} + : connector::common::ConnectorFactory(connectorName) {} - std::shared_ptr newConnector( + std::shared_ptr newConnector( const std::string& id, std::shared_ptr config, folly::Executor* ioExecutor = nullptr, @@ -158,15 +158,15 @@ void registerHivePartitionFunctionSerDe(); /// Hook for connecting metadata functions to a HiveConnector. Each registered /// factory is called after initializing a HiveConnector until one of these -/// returns a ConnectorMetadata instance. +/// returns a connector::common::ConnectorMetadata instance. class HiveConnectorMetadataFactory { public: virtual ~HiveConnectorMetadataFactory() = default; - /// Returns a ConnectorMetadata to complete'hiveConnector' if 'this' + /// Returns a connector::common::ConnectorMetadata to complete'hiveConnector' if 'this' /// recognizes a data source, e.g. local file system or remote metadata /// service associated to configs in 'hiveConnector'. - virtual std::shared_ptr create( + virtual std::shared_ptr create( HiveConnector* connector) = 0; }; diff --git a/velox/connectors/hive/HiveConnectorSplit.h b/velox/connectors/hive/HiveConnectorSplit.h index 710d021eccba..55dd8efb3189 100644 --- a/velox/connectors/hive/HiveConnectorSplit.h +++ b/velox/connectors/hive/HiveConnectorSplit.h @@ -17,7 +17,7 @@ #include #include -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" #include "velox/connectors/hive/FileProperties.h" #include "velox/connectors/hive/TableHandle.h" #include "velox/dwio/common/Options.h" @@ -42,7 +42,7 @@ struct RowIdProperties { std::string tableGuid; }; -struct HiveConnectorSplit : public connector::ConnectorSplit { +struct HiveConnectorSplit : public connector::common::ConnectorSplit { const std::string filePath; dwio::common::FileFormat fileFormat; const uint64_t start; @@ -95,7 +95,7 @@ struct HiveConnectorSplit : public connector::ConnectorSplit { std::optional _rowIdProperties = std::nullopt, const std::optional& _bucketConversion = std::nullopt) - : ConnectorSplit(connectorId, splitWeight, cacheable), + : connector::common::ConnectorSplit(connectorId, splitWeight, cacheable), filePath(_filePath), fileFormat(_fileFormat), start(_start), diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp index cd263863f922..b6e772d8d951 100644 --- a/velox/connectors/hive/HiveConnectorUtil.cpp +++ b/velox/connectors/hive/HiveConnectorUtil.cpp @@ -27,7 +27,7 @@ namespace facebook::velox::connector::hive { namespace { struct SubfieldSpec { - const common::Subfield* subfield; + const velox::common::Subfield* subfield; bool filterOnly; }; @@ -44,9 +44,9 @@ void deduplicate(std::vector& values) { // generates a[9223372036854775807]; for anything smaller than // -9223372036854775808 it generates a[-9223372036854775808]. 
template -std::unique_ptr makeFloatingPointMapKeyFilter( +std::unique_ptr makeFloatingPointMapKeyFilter( const std::vector& subscripts) { - std::vector> filters; + std::vector> filters; for (auto subscript : subscripts) { T lower = subscript; T upper = subscript; @@ -73,7 +73,7 @@ std::unique_ptr makeFloatingPointMapKeyFilter( if (lowerUnbounded && upperUnbounded) { continue; } - filters.push_back(std::make_unique>( + filters.push_back(std::make_unique>( lower, lowerUnbounded, lowerExclusive, @@ -85,7 +85,7 @@ std::unique_ptr makeFloatingPointMapKeyFilter( if (filters.size() == 1) { return std::move(filters[0]); } - return std::make_unique(std::move(filters), false); + return std::make_unique(std::move(filters), false); } // Recursively add subfields to scan spec. @@ -94,7 +94,7 @@ void addSubfields( std::vector& subfields, int level, memory::MemoryPool* pool, - common::ScanSpec& spec) { + velox::common::ScanSpec& spec) { int newSize = 0; for (int i = 0; i < subfields.size(); ++i) { if (level < subfields[i].subfield->path().size()) { @@ -111,7 +111,7 @@ void addSubfields( for (auto& subfield : subfields) { auto* element = subfield.subfield->path()[level].get(); auto* nestedField = - dynamic_cast(element); + dynamic_cast(element); VELOX_CHECK( nestedField, "Unsupported for row subfields pruning: {}", @@ -150,12 +150,12 @@ void addSubfields( std::vector longSubscripts; for (auto& subfield : subfields) { auto* element = subfield.subfield->path()[level].get(); - if (dynamic_cast(element)) { + if (dynamic_cast(element)) { return; } if (stringKey) { auto* subscript = - dynamic_cast(element); + dynamic_cast(element); VELOX_CHECK( subscript, "Unsupported for string map pruning: {}", @@ -163,7 +163,7 @@ void addSubfields( stringSubscripts.push_back(subscript->index()); } else { auto* subscript = - dynamic_cast(element); + dynamic_cast(element); VELOX_CHECK( subscript, "Unsupported for long map pruning: {}", @@ -171,10 +171,10 @@ void addSubfields( longSubscripts.push_back(subscript->index()); } } - std::unique_ptr filter; + std::unique_ptr filter; if (stringKey) { deduplicate(stringSubscripts); - filter = std::make_unique(stringSubscripts, false); + filter = std::make_unique(stringSubscripts, false); spec.setFlatMapFeatureSelection(std::move(stringSubscripts)); } else { deduplicate(longSubscripts); @@ -183,7 +183,7 @@ void addSubfields( } else if (keyType->isDouble()) { filter = makeFloatingPointMapKeyFilter(longSubscripts); } else { - filter = common::createBigintValues(longSubscripts, false); + filter = velox::common::createBigintValues(longSubscripts, false); } std::vector features; for (auto num : longSubscripts) { @@ -208,11 +208,11 @@ void addSubfields( long maxIndex = -1; for (auto& subfield : subfields) { auto* element = subfield.subfield->path()[level].get(); - if (dynamic_cast(element)) { + if (dynamic_cast(element)) { return; } auto* subscript = - dynamic_cast(element); + dynamic_cast(element); VELOX_CHECK( subscript, "Unsupported for array pruning: {}", @@ -255,9 +255,9 @@ bool isSpecialColumn( } // namespace -const std::string& getColumnName(const common::Subfield& subfield) { +const std::string& getColumnName(const velox::common::Subfield& subfield) { VELOX_CHECK_GT(subfield.path().size(), 0); - auto* field = dynamic_cast( + auto* field = dynamic_cast( subfield.path()[0].get()); VELOX_CHECK_NOT_NULL(field); return field->name(); @@ -288,7 +288,7 @@ void checkColumnNameLowerCase(const std::shared_ptr& type) { } void checkColumnNameLowerCase( - const common::SubfieldFilters& filters, + 
const velox::common::SubfieldFilters& filters, const std::unordered_map>& infoColumns) { for (const auto& filterIt : filters) { @@ -300,7 +300,7 @@ void checkColumnNameLowerCase( for (int i = 0; i < path.size(); ++i) { auto* nestedField = - dynamic_cast(path[i].get()); + dynamic_cast(path[i].get()); if (nestedField == nullptr) { continue; } @@ -325,12 +325,12 @@ namespace { void processFieldSpec( const RowTypePtr& dataColumns, const TypePtr& outputType, - common::ScanSpec& fieldSpec) { - fieldSpec.visit(*outputType, [](const Type& type, common::ScanSpec& spec) { + velox::common::ScanSpec& fieldSpec) { + fieldSpec.visit(*outputType, [](const Type& type, velox::common::ScanSpec& spec) { if (type.isMap() && !spec.isConstant()) { - auto* keys = spec.childByName(common::ScanSpec::kMapKeysFieldName); + auto* keys = spec.childByName(velox::common::ScanSpec::kMapKeysFieldName); VELOX_CHECK_NOT_NULL(keys); - keys->addFilter(common::IsNotNull()); + keys->addFilter(velox::common::IsNotNull()); } }); if (dataColumns) { @@ -345,11 +345,11 @@ void processFieldSpec( } // namespace -std::shared_ptr makeScanSpec( +std::shared_ptr makeScanSpec( const RowTypePtr& rowType, - const folly::F14FastMap>& + const folly::F14FastMap>& outputSubfields, - const common::SubfieldFilters& filters, + const velox::common::SubfieldFilters& filters, const RowTypePtr& dataColumns, const std::unordered_map>& partitionKeys, @@ -358,8 +358,8 @@ std::shared_ptr makeScanSpec( const SpecialColumnNames& specialColumns, bool disableStatsBasedFilterReorder, memory::MemoryPool* pool) { - auto spec = std::make_shared("root"); - folly::F14FastMap> + auto spec = std::make_shared("root"); + folly::F14FastMap> filterSubfields; std::vector subfieldSpecs; for (auto& [subfield, _] : filters) { @@ -379,16 +379,16 @@ std::shared_ptr makeScanSpec( if (isSpecialColumn(name, specialColumns.rowIndex)) { VELOX_CHECK(type->isBigint()); auto* fieldSpec = spec->addField(name, i); - fieldSpec->setColumnType(common::ScanSpec::ColumnType::kRowIndex); + fieldSpec->setColumnType(velox::common::ScanSpec::ColumnType::kRowIndex); continue; } if (isSpecialColumn(name, specialColumns.rowId)) { VELOX_CHECK(type->isRow() && type->size() == 5); auto& rowIdType = type->asRow(); auto* fieldSpec = spec->addFieldRecursively(name, rowIdType, i); - fieldSpec->setColumnType(common::ScanSpec::ColumnType::kComposite); + fieldSpec->setColumnType(velox::common::ScanSpec::ColumnType::kComposite); fieldSpec->childByName(rowIdType.nameOf(0)) - ->setColumnType(common::ScanSpec::ColumnType::kRowIndex); + ->setColumnType(velox::common::ScanSpec::ColumnType::kRowIndex); continue; } auto it = outputSubfields.find(name); @@ -526,7 +526,7 @@ std::unique_ptr parseSerdeParameters( void configureReaderOptions( const std::shared_ptr& hiveConfig, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveTableHandle, const std::shared_ptr& hiveSplit, dwio::common::ReaderOptions& readerOptions) { @@ -541,7 +541,7 @@ void configureReaderOptions( void configureReaderOptions( const std::shared_ptr& hiveConfig, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const RowTypePtr& fileSchema, const std::shared_ptr& hiveSplit, const std::unordered_map& tableParameters, @@ -607,8 +607,8 @@ void configureReaderOptions( void configureRowReaderOptions( const std::unordered_map& tableParameters, - const std::shared_ptr& scanSpec, - std::shared_ptr metadataFilter, + 
const std::shared_ptr& scanSpec, + std::shared_ptr metadataFilter, const RowTypePtr& rowType, const std::shared_ptr& hiveSplit, const std::shared_ptr& hiveConfig, @@ -636,7 +636,7 @@ bool applyPartitionFilter( const TypePtr& type, const std::string& partitionValue, bool isPartitionDateDaysSinceEpoch, - common::Filter* filter, + velox::common::Filter* filter, bool asLocalTime) { if (type->isDate()) { int32_t result = 0; @@ -685,7 +685,7 @@ bool applyPartitionFilter( } // namespace bool testFilters( - const common::ScanSpec* scanSpec, + const velox::common::ScanSpec* scanSpec, const dwio::common::Reader* reader, const std::string& filePath, const std::unordered_map>& @@ -749,7 +749,7 @@ bool testFilters( std::unique_ptr createBufferedInput( const FileHandle& fileHandle, const dwio::common::ReaderOptions& readerOpts, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, std::shared_ptr ioStats, std::shared_ptr fsStats, folly::Executor* executor) { @@ -759,7 +759,7 @@ std::unique_ptr createBufferedInput( dwio::common::MetricsLog::voidLog(), fileHandle.uuid.id(), connectorQueryCtx->cache(), - Connector::getTracker( + connector::common::Connector::getTracker( connectorQueryCtx->scanId(), readerOpts.loadQuantum()), fileHandle.groupId.id(), ioStats, @@ -771,7 +771,7 @@ std::unique_ptr createBufferedInput( fileHandle.file, dwio::common::MetricsLog::voidLog(), fileHandle.uuid.id(), - Connector::getTracker( + connector::common::Connector::getTracker( connectorQueryCtx->scanId(), readerOpts.loadQuantum()), fileHandle.groupId.id(), std::move(ioStats), @@ -853,15 +853,15 @@ core::TypedExprPtr extractFiltersFromRemainingFilter( const core::TypedExprPtr& expr, core::ExpressionEvaluator* evaluator, bool negated, - common::SubfieldFilters& filters, + velox::common::SubfieldFilters& filters, double& sampleRate) { auto* call = dynamic_cast(expr.get()); if (call == nullptr) { return expr; } - common::Filter* oldFilter = nullptr; + velox::common::Filter* oldFilter = nullptr; try { - common::Subfield subfield; + velox::common::Subfield subfield; if (auto filter = exec::ExprToSubfieldFilterParser::getInstance() ->leafCallToSubfieldFilter( *call, subfield, evaluator, negated)) { diff --git a/velox/connectors/hive/HiveConnectorUtil.h b/velox/connectors/hive/HiveConnectorUtil.h index 0209a50848e2..f9384cfb7620 100644 --- a/velox/connectors/hive/HiveConnectorUtil.h +++ b/velox/connectors/hive/HiveConnectorUtil.h @@ -18,7 +18,7 @@ #include #include -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" #include "velox/connectors/hive/FileHandle.h" #include "velox/dwio/common/BufferedInput.h" #include "velox/dwio/common/Reader.h" @@ -30,12 +30,12 @@ class HiveTableHandle; class HiveConfig; struct HiveConnectorSplit; -const std::string& getColumnName(const common::Subfield& subfield); +const std::string& getColumnName(const velox::common::Subfield& subfield); void checkColumnNameLowerCase(const std::shared_ptr& type); void checkColumnNameLowerCase( - const common::SubfieldFilters& filters, + const velox::common::SubfieldFilters& filters, const std::unordered_map>& infoColumns); @@ -46,11 +46,11 @@ struct SpecialColumnNames { std::optional rowId; }; -std::shared_ptr makeScanSpec( +std::shared_ptr makeScanSpec( const RowTypePtr& rowType, - const folly::F14FastMap>& + const folly::F14FastMap>& outputSubfields, - const common::SubfieldFilters& filters, + const velox::common::SubfieldFilters& filters, const RowTypePtr& dataColumns, 
const std::unordered_map>& partitionKeys, @@ -62,14 +62,14 @@ std::shared_ptr makeScanSpec( void configureReaderOptions( const std::shared_ptr& config, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveTableHandle, const std::shared_ptr& hiveSplit, dwio::common::ReaderOptions& readerOptions); void configureReaderOptions( const std::shared_ptr& hiveConfig, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const RowTypePtr& fileSchema, const std::shared_ptr& hiveSplit, const std::unordered_map& tableParameters, @@ -77,8 +77,8 @@ void configureReaderOptions( void configureRowReaderOptions( const std::unordered_map& tableParameters, - const std::shared_ptr& scanSpec, - std::shared_ptr metadataFilter, + const std::shared_ptr& scanSpec, + std::shared_ptr metadataFilter, const RowTypePtr& rowType, const std::shared_ptr& hiveSplit, const std::shared_ptr& hiveConfig, @@ -86,7 +86,7 @@ void configureRowReaderOptions( dwio::common::RowReaderOptions& rowReaderOptions); bool testFilters( - const common::ScanSpec* scanSpec, + const velox::common::ScanSpec* scanSpec, const dwio::common::Reader* reader, const std::string& filePath, const std::unordered_map>& @@ -98,7 +98,7 @@ bool testFilters( std::unique_ptr createBufferedInput( const FileHandle& fileHandle, const dwio::common::ReaderOptions& readerOpts, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, std::shared_ptr ioStats, std::shared_ptr fsStats, folly::Executor* executor); @@ -107,7 +107,7 @@ core::TypedExprPtr extractFiltersFromRemainingFilter( const core::TypedExprPtr& expr, core::ExpressionEvaluator* evaluator, bool negated, - common::SubfieldFilters& filters, + velox::common::SubfieldFilters& filters, double& sampleRate); } // namespace facebook::velox::connector::hive diff --git a/velox/connectors/hive/HiveDataSink.cpp b/velox/connectors/hive/HiveDataSink.cpp index 36ec297220d9..9b70d4fe8eda 100644 --- a/velox/connectors/hive/HiveDataSink.cpp +++ b/velox/connectors/hive/HiveDataSink.cpp @@ -123,10 +123,10 @@ std::string makeUuid() { return boost::lexical_cast(boost::uuids::random_generator()()); } -std::unordered_map tableTypeNames() { +std::unordered_map tableTypeNames() { return { - {LocationHandle::TableType::kNew, "kNew"}, - {LocationHandle::TableType::kExisting, "kExisting"}, + {connector::common::LocationHandle::TableType::kNew, "kNew"}, + {connector::common::LocationHandle::TableType::kExisting, "kExisting"}, }; } @@ -226,13 +226,13 @@ std::string HiveWriterId::toString() const { return "unpart"; } -const std::string LocationHandle::tableTypeName( - LocationHandle::TableType type) { +const std::string HiveLocationHandle::tableTypeName( + connector::common::LocationHandle::TableType type) { static const auto tableTypes = tableTypeNames(); return tableTypes.at(type); } -LocationHandle::TableType LocationHandle::tableTypeFromName( +LocationHandle::TableType HiveLocationHandle::tableTypeFromName( const std::string& name) { static const auto nameTableTypes = invertMap(tableTypeNames()); return nameTableTypes.at(name); @@ -368,8 +368,8 @@ std::string HiveBucketProperty::toString() const { HiveDataSink::HiveDataSink( RowTypePtr inputType, std::shared_ptr insertTableHandle, - const ConnectorQueryCtx* connectorQueryCtx, - CommitStrategy commitStrategy, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, + 
connector::common::CommitStrategy commitStrategy, const std::shared_ptr& hiveConfig) : HiveDataSink( inputType, @@ -387,8 +387,8 @@ HiveDataSink::HiveDataSink( HiveDataSink::HiveDataSink( RowTypePtr inputType, std::shared_ptr insertTableHandle, - const ConnectorQueryCtx* connectorQueryCtx, - CommitStrategy commitStrategy, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, + connector::common::CommitStrategy commitStrategy, const std::shared_ptr& hiveConfig, uint32_t bucketCount, std::unique_ptr bucketFunction) @@ -427,8 +427,8 @@ HiveDataSink::HiveDataSink( bucketCount_, maxBucketCount(), "bucketCount exceeds the limit"); } VELOX_USER_CHECK( - (commitStrategy_ == CommitStrategy::kNoCommit) || - (commitStrategy_ == CommitStrategy::kTaskCommit), + (commitStrategy_ == connector::common::CommitStrategy::kNoCommit) || + (commitStrategy_ == connector::common::CommitStrategy::kTaskCommit), "Unsupported commit strategy: {}", commitStrategyToString(commitStrategy_)); @@ -754,11 +754,12 @@ uint32_t HiveDataSink::appendWriter(const HiveWriterId& id) { if (sortWrite()) { sortPool = createSortPool(writerPool); } - writerInfo_.emplace_back(std::make_shared( - std::move(writerParameters), - std::move(writerPool), - std::move(sinkPool), - std::move(sortPool))); + writerInfo_.emplace_back( + std::make_shared( + std::move(writerParameters), + std::move(writerPool), + std::move(sinkPool), + std::move(sortPool))); ioStats_.emplace_back(std::make_shared()); setMemoryReclaimers(writerInfo_.back().get(), ioStats_.back().get()); @@ -939,7 +940,7 @@ std::pair HiveDataSink::getWriterFileNames( std::pair HiveInsertFileNameGenerator::gen( std::optional bucketId, const std::shared_ptr insertTableHandle, - const ConnectorQueryCtx& connectorQueryCtx, + const connector::common::ConnectorQueryCtx& connectorQueryCtx, bool commitRequired) const { auto targetFileName = insertTableHandle->locationHandle()->targetFileName(); const bool generateFileName = targetFileName.empty(); @@ -1041,7 +1042,7 @@ bool HiveInsertTableHandle::isBucketed() const { } bool HiveInsertTableHandle::isExistingTable() const { - return locationHandle_->tableType() == LocationHandle::TableType::kExisting; + return locationHandle_->tableType() == connector::common::LocationHandle::TableType::kExisting; } folly::dynamic HiveInsertTableHandle::serialize() const { @@ -1061,7 +1062,7 @@ folly::dynamic HiveInsertTableHandle::serialize() const { } if (compressionKind_.has_value()) { - obj["compressionKind"] = common::compressionKindToString(*compressionKind_); + obj["compressionKind"] = velox::common::compressionKindToString(*compressionKind_); } folly::dynamic params = folly::dynamic::object; @@ -1079,14 +1080,14 @@ HiveInsertTableHandlePtr HiveInsertTableHandle::create( auto inputColumns = ISerializable::deserialize>( obj["inputColumns"]); auto locationHandle = - ISerializable::deserialize(obj["locationHandle"]); + ISerializable::deserialize(obj["locationHandle"]); auto storageFormat = dwio::common::toFileFormat(obj["tableStorageFormat"].asString()); - std::optional compressionKind = std::nullopt; + std::optional compressionKind = std::nullopt; if (obj.count("compressionKind") > 0) { compressionKind = - common::stringToCompressionKind(obj["compressionKind"].asString()); + velox::common::stringToCompressionKind(obj["compressionKind"].asString()); } std::shared_ptr bucketProperty; @@ -1125,7 +1126,7 @@ std::string HiveInsertTableHandle::toString() const { std::ostringstream out; out << "HiveInsertTableHandle [" << 
dwio::common::toString(storageFormat_); if (compressionKind_.has_value()) { - out << " " << common::compressionKindToString(compressionKind_.value()); + out << " " << velox::common::compressionKindToString(compressionKind_.value()); } else { out << " none"; } @@ -1151,7 +1152,7 @@ std::string HiveInsertTableHandle::toString() const { return out.str(); } -std::string LocationHandle::toString() const { +std::string HiveLocationHandle::toString() const { return fmt::format( "LocationHandle [targetPath: {}, writePath: {}, tableType: {}, tableFileName: {}]", targetPath_, @@ -1160,12 +1161,12 @@ std::string LocationHandle::toString() const { targetFileName_); } -void LocationHandle::registerSerDe() { +void HiveLocationHandle::registerSerDe() { auto& registry = DeserializationRegistryForSharedPtr(); - registry.Register("LocationHandle", LocationHandle::create); + registry.Register("LocationHandle", HiveLocationHandle::create); } -folly::dynamic LocationHandle::serialize() const { +folly::dynamic HiveLocationHandle::serialize() const { folly::dynamic obj = folly::dynamic::object; obj["name"] = "LocationHandle"; obj["targetPath"] = targetPath_; @@ -1175,12 +1176,12 @@ folly::dynamic LocationHandle::serialize() const { return obj; } -LocationHandlePtr LocationHandle::create(const folly::dynamic& obj) { +LocationHandlePtr HiveLocationHandle::create(const folly::dynamic& obj) { auto targetPath = obj["targetPath"].asString(); auto writePath = obj["writePath"].asString(); auto tableType = tableTypeFromName(obj["tableType"].asString()); auto targetFileName = obj["targetFileName"].asString(); - return std::make_shared( + return std::make_shared( targetPath, writePath, tableType, targetFileName); } diff --git a/velox/connectors/hive/HiveDataSink.h b/velox/connectors/hive/HiveDataSink.h index c1354b1ca6b3..3a6fe89873e0 100644 --- a/velox/connectors/hive/HiveDataSink.h +++ b/velox/connectors/hive/HiveDataSink.h @@ -16,7 +16,7 @@ #pragma once #include "velox/common/compression/Compression.h" -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" #include "velox/connectors/hive/HiveConfig.h" #include "velox/connectors/hive/PartitionIdGenerator.h" #include "velox/connectors/hive/TableHandle.h" @@ -25,73 +25,74 @@ #include "velox/dwio/common/WriterFactory.h" #include "velox/exec/MemoryReclaimer.h" -namespace facebook::velox::dwrf { -class Writer; -} +// namespace facebook::velox::dwrf { +// class Writer; +// } namespace facebook::velox::connector::hive { -class LocationHandle; -using LocationHandlePtr = std::shared_ptr; - +class HiveLocationHandle; +using HiveLocationHandlePtr = std::shared_ptr; +// /// Location related properties of the Hive table to be written. -class LocationHandle : public ISerializable { - public: - enum class TableType { - /// Write to a new table to be created. - kNew, - /// Write to an existing table. 
- kExisting, - }; - - LocationHandle( - std::string targetPath, - std::string writePath, - TableType tableType, - std::string targetFileName = "") - : targetPath_(std::move(targetPath)), - targetFileName_(std::move(targetFileName)), - writePath_(std::move(writePath)), - tableType_(tableType) {} - - const std::string& targetPath() const { - return targetPath_; - } - - const std::string& targetFileName() const { - return targetFileName_; - } - - const std::string& writePath() const { - return writePath_; - } - - TableType tableType() const { - return tableType_; - } - - std::string toString() const; - - static void registerSerDe(); - - folly::dynamic serialize() const override; - - static LocationHandlePtr create(const folly::dynamic& obj); - - static const std::string tableTypeName(LocationHandle::TableType type); - - static LocationHandle::TableType tableTypeFromName(const std::string& name); - - private: - // Target directory path. - const std::string targetPath_; - // If non-empty, use this name instead of generating our own. - const std::string targetFileName_; - // Staging directory path. - const std::string writePath_; - // Whether the table to be written is new, already existing or temporary. - const TableType tableType_; -}; +class HiveLocationHandle : public connector::common::LocationHandle { + public: +// enum class TableType { +// /// Write to a new table to be created. +// kNew, +// /// Write to an existing table. +// kExisting, +// }; + + HiveLocationHandle( + std::string targetPath, + std::string writePath, + TableType tableType, + std::string targetFileName = "") + : targetPath_(std::move(targetPath)), + targetFileName_(std::move(targetFileName)), + writePath_(std::move(writePath)), + tableType_(tableType) {} + + const std::string& targetPath() const { + return targetPath_; + } + + const std::string& targetFileName() const { + return targetFileName_; + } + + const std::string& writePath() const { + return writePath_; + } + + TableType tableType() const override { + return tableType_; + } + + std::string toString() const; + + static void registerSerDe(); + + folly::dynamic serialize() const override; + + static HiveLocationHandlePtr create(const folly::dynamic& obj); + + static const std::string tableTypeName(HiveLocationHandle::TableType type); + + static HiveLocationHandle::TableType tableTypeFromName(const std::string& + name); + + private: + // Target directory path. + const std::string targetPath_; + // If non-empty, use this name instead of generating our own. + const std::string targetFileName_; + // Staging directory path. + const std::string writePath_; + // Whether the table to be written is new, already existing or temporary. 
+ const TableType tableType_; + }; class HiveSortingColumn : public ISerializable { public: @@ -201,7 +202,7 @@ class FileNameGenerator : public ISerializable { virtual std::pair gen( std::optional bucketId, const std::shared_ptr insertTableHandle, - const ConnectorQueryCtx& connectorQueryCtx, + const connector::common::ConnectorQueryCtx& connectorQueryCtx, bool commitRequired) const = 0; virtual std::string toString() const = 0; @@ -214,7 +215,7 @@ class HiveInsertFileNameGenerator : public FileNameGenerator { std::pair gen( std::optional bucketId, const std::shared_ptr insertTableHandle, - const ConnectorQueryCtx& connectorQueryCtx, + const connector::common::ConnectorQueryCtx& connectorQueryCtx, bool commitRequired) const override; static void registerSerDe(); @@ -229,14 +230,14 @@ class HiveInsertFileNameGenerator : public FileNameGenerator { }; /// Represents a request for Hive write. -class HiveInsertTableHandle : public ConnectorInsertTableHandle { +class HiveInsertTableHandle : public connector::common::ConnectorInsertTableHandle { public: HiveInsertTableHandle( std::vector> inputColumns, - std::shared_ptr locationHandle, + std::shared_ptr locationHandle, dwio::common::FileFormat storageFormat = dwio::common::FileFormat::DWRF, std::shared_ptr bucketProperty = nullptr, - std::optional compressionKind = {}, + std::optional compressionKind = {}, const std::unordered_map& serdeParameters = {}, const std::shared_ptr& writerOptions = nullptr, @@ -257,7 +258,7 @@ class HiveInsertTableHandle : public ConnectorInsertTableHandle { fileNameGenerator_(std::move(fileNameGenerator)) { if (compressionKind.has_value()) { VELOX_CHECK( - compressionKind.value() != common::CompressionKind_MAX, + compressionKind.value() != velox::common::CompressionKind_MAX, "Unsupported compression type: CompressionKind_MAX"); } @@ -284,11 +285,11 @@ class HiveInsertTableHandle : public ConnectorInsertTableHandle { return inputColumns_; } - const std::shared_ptr& locationHandle() const { + const std::shared_ptr& locationHandle() const { return locationHandle_; } - std::optional compressionKind() const { + std::optional compressionKind() const { return compressionKind_; } @@ -334,10 +335,10 @@ class HiveInsertTableHandle : public ConnectorInsertTableHandle { private: const std::vector> inputColumns_; - const std::shared_ptr locationHandle_; + const std::shared_ptr locationHandle_; const dwio::common::FileFormat storageFormat_; const std::shared_ptr bucketProperty_; - const std::optional compressionKind_; + const std::optional compressionKind_; const std::unordered_map serdeParameters_; const std::shared_ptr writerOptions_; const bool ensureFiles_; @@ -435,7 +436,7 @@ struct HiveWriterInfo { std::shared_ptr _sortPool) : writerParameters(std::move(parameters)), nonReclaimableSectionHolder(new tsan_atomic(false)), - spillStats(std::make_unique>()), + spillStats(std::make_unique>()), writerPool(std::move(_writerPool)), sinkPool(std::move(_sinkPool)), sortPool(std::move(_sortPool)) {} @@ -444,7 +445,7 @@ struct HiveWriterInfo { const std::unique_ptr> nonReclaimableSectionHolder; /// Collects the spill stats from sort writer if the spilling has been /// triggered. 
- const std::unique_ptr> spillStats; + const std::unique_ptr> spillStats; const std::shared_ptr writerPool; const std::shared_ptr sinkPool; const std::shared_ptr sortPool; @@ -490,7 +491,7 @@ struct HiveWriterIdEq { } }; -class HiveDataSink : public DataSink { +class HiveDataSink : public connector::common::DataSink { public: /// The list of runtime stats reported by hive data sink static constexpr const char* kEarlyFlushedRawBytes = "earlyFlushedRawBytes"; @@ -512,15 +513,15 @@ class HiveDataSink : public DataSink { HiveDataSink( RowTypePtr inputType, std::shared_ptr insertTableHandle, - const ConnectorQueryCtx* connectorQueryCtx, - CommitStrategy commitStrategy, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, + connector::common::CommitStrategy commitStrategy, const std::shared_ptr& hiveConfig); HiveDataSink( RowTypePtr inputType, std::shared_ptr insertTableHandle, - const ConnectorQueryCtx* connectorQueryCtx, - CommitStrategy commitStrategy, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, + connector::common::CommitStrategy commitStrategy, const std::shared_ptr& hiveConfig, uint32_t bucketCount, std::unique_ptr bucketFunction); @@ -598,7 +599,7 @@ class HiveDataSink : public DataSink { } FOLLY_ALWAYS_INLINE bool isCommitRequired() const { - return commitStrategy_ != CommitStrategy::kNoCommit; + return commitStrategy_ != connector::common::CommitStrategy::kNoCommit; } std::shared_ptr createWriterPool( @@ -660,8 +661,8 @@ class HiveDataSink : public DataSink { const RowTypePtr inputType_; const std::shared_ptr insertTableHandle_; - const ConnectorQueryCtx* const connectorQueryCtx_; - const CommitStrategy commitStrategy_; + const connector::common::ConnectorQueryCtx* const connectorQueryCtx_; + const connector::common::CommitStrategy commitStrategy_; const std::shared_ptr hiveConfig_; const HiveWriterParameters::UpdateMode updateMode_; const uint32_t maxOpenWriters_; @@ -672,7 +673,7 @@ class HiveDataSink : public DataSink { const int32_t bucketCount_{0}; const std::unique_ptr bucketFunction_; const std::shared_ptr writerFactory_; - const common::SpillConfig* const spillConfig_; + const velox::common::SpillConfig* const spillConfig_; const uint64_t sortWriterFinishTimeSliceLimitMs_{0}; std::vector sortColumnIndices_; @@ -727,13 +728,13 @@ struct fmt::formatter } }; -template <> -struct fmt::formatter< - facebook::velox::connector::hive::LocationHandle::TableType> - : formatter { - auto format( - facebook::velox::connector::hive::LocationHandle::TableType s, - format_context& ctx) const { - return formatter::format(static_cast(s), ctx); - } -}; +//template <> +//struct fmt::formatter< +// facebook::velox::connector::common::LocationHandle::TableType> +// : formatter { +// auto format( +// facebook::velox::connector::common::LocationHandle::TableType s, +// format_context& ctx) const { +// return formatter::format(static_cast(s), ctx); +// } +//}; diff --git a/velox/connectors/hive/HiveDataSource.cpp b/velox/connectors/hive/HiveDataSource.cpp index a092e34f8904..261b89b06628 100644 --- a/velox/connectors/hive/HiveDataSource.cpp +++ b/velox/connectors/hive/HiveDataSource.cpp @@ -58,13 +58,14 @@ bool shouldEagerlyMaterialize( HiveDataSource::HiveDataSource( const RowTypePtr& outputType, - const std::shared_ptr& tableHandle, + const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& + columnHandles, FileHandleFactory* fileHandleFactory, folly::Executor* executor, - const ConnectorQueryCtx* 
connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveConfig) : fileHandleFactory_(fileHandleFactory), executor_(executor), @@ -198,7 +199,7 @@ HiveDataSource::HiveDataSource( connectorQueryCtx_->sessionProperties()), pool_); if (remainingFilter) { - metadataFilter_ = std::make_shared( + metadataFilter_ = std::make_shared( *scanSpec_, *remainingFilter, expressionEvaluator_); } @@ -297,7 +298,8 @@ void HiveDataSource::setupRowIdColumn() { connectorQueryCtx_->memoryPool()); } -void HiveDataSource::addSplit(std::shared_ptr split) { +void HiveDataSource::addSplit( + std::shared_ptr split) { VELOX_CHECK_NULL( split_, "Previous split has not been processed yet. Call next to process the split."); @@ -463,7 +465,7 @@ std::optional HiveDataSource::next( void HiveDataSource::addDynamicFilter( column_index_t outputChannel, - const std::shared_ptr& filter) { + const std::shared_ptr& filter) { auto& fieldSpec = scanSpec_->getChildByChannel(outputChannel); fieldSpec.addFilter(*filter); scanSpec_->resetCachedValues(true); @@ -531,7 +533,7 @@ std::unordered_map HiveDataSource::runtimeStats() { } void HiveDataSource::setFromDataSource( - std::unique_ptr sourceUnique) { + std::unique_ptr sourceUnique) { auto source = dynamic_cast(sourceUnique.get()); VELOX_CHECK_NOT_NULL(source, "Bad DataSource type"); diff --git a/velox/connectors/hive/HiveDataSource.h b/velox/connectors/hive/HiveDataSource.h index 01d496a6c521..42ff34e6d477 100644 --- a/velox/connectors/hive/HiveDataSource.h +++ b/velox/connectors/hive/HiveDataSource.h @@ -15,10 +15,10 @@ */ #pragma once +#include "velox/connectors/common/Connector.h" #include "velox/common/base/RandomUtil.h" #include "velox/common/file/FileSystems.h" #include "velox/common/io/IoStatistics.h" -#include "velox/connectors/Connector.h" #include "velox/connectors/hive/FileHandle.h" #include "velox/connectors/hive/HiveConnectorSplit.h" #include "velox/connectors/hive/HiveConnectorUtil.h" @@ -33,27 +33,27 @@ namespace facebook::velox::connector::hive { class HiveConfig; -class HiveDataSource : public DataSource { +class HiveDataSource : public connector::common::DataSource { public: HiveDataSource( const RowTypePtr& outputType, - const std::shared_ptr& tableHandle, + const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, FileHandleFactory* fileHandleFactory, folly::Executor* executor, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveConfig); - void addSplit(std::shared_ptr split) override; + void addSplit(std::shared_ptr split) override; std::optional next(uint64_t size, velox::ContinueFuture& future) override; void addDynamicFilter( column_index_t outputChannel, - const std::shared_ptr& filter) override; + const std::shared_ptr& filter) override; uint64_t getCompletedBytes() override { return ioStats_->rawBytesRead(); @@ -69,7 +69,7 @@ class HiveDataSource : public DataSource { return splitReader_ && splitReader_->allPrefetchIssued(); } - void setFromDataSource(std::unique_ptr sourceUnique) override; + void setFromDataSource(std::unique_ptr sourceUnique) override; int64_t estimatedRowSize() override; @@ -78,23 +78,23 @@ class HiveDataSource : public DataSource { using WaveDelegateHookFunction = std::function( const std::shared_ptr& hiveTableHandle, - const std::shared_ptr& scanSpec, + const std::shared_ptr& scanSpec, const RowTypePtr& 
readerOutputType, std::unordered_map>* partitionKeys, FileHandleFactory* fileHandleFactory, folly::Executor* executor, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveConfig, const std::shared_ptr& ioStats, const exec::ExprSet* remainingFilter, - std::shared_ptr metadataFilter)>; + std::shared_ptr metadataFilter)>; static WaveDelegateHookFunction waveDelegateHook_; static void registerWaveDelegateHook(WaveDelegateHookFunction hook); - const ConnectorQueryCtx* testingConnectorQueryCtx() const { + const connector::common::ConnectorQueryCtx* testingConnectorQueryCtx() const { return connectorQueryCtx_; } @@ -103,13 +103,13 @@ class HiveDataSource : public DataSource { FileHandleFactory* const fileHandleFactory_; folly::Executor* const executor_; - const ConnectorQueryCtx* const connectorQueryCtx_; + const connector::common::ConnectorQueryCtx* const connectorQueryCtx_; const std::shared_ptr hiveConfig_; memory::MemoryPool* const pool_; std::shared_ptr split_; std::shared_ptr hiveTableHandle_; - std::shared_ptr scanSpec_; + std::shared_ptr scanSpec_; VectorPtr output_; std::unique_ptr splitReader_; @@ -159,11 +159,11 @@ class HiveDataSource : public DataSource { std::unordered_map> infoColumns_; SpecialColumnNames specialColumns_{}; - std::vector remainingFilterSubfields_; - folly::F14FastMap> + std::vector remainingFilterSubfields_; + folly::F14FastMap> subfields_; - common::SubfieldFilters filters_; - std::shared_ptr metadataFilter_; + velox::common::SubfieldFilters filters_; + std::shared_ptr metadataFilter_; std::unique_ptr remainingFilterExprSet_; RowVectorPtr emptyOutput_; dwio::common::RuntimeStatistics runtimeStats_; diff --git a/velox/connectors/hive/HiveObjectFactory.cpp b/velox/connectors/hive/HiveObjectFactory.cpp new file mode 100644 index 000000000000..4d7002d6a7ca --- /dev/null +++ b/velox/connectors/hive/HiveObjectFactory.cpp @@ -0,0 +1,353 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "velox/connectors/hive/HiveObjectFactory.h" + +#include + +#include + +#include "velox/connectors/common/Connector.h" +#include "velox/connectors/common/ConnectorNames.h" +#include "velox/connectors/common/ConnectorObjectFactory.h" +#include "velox/connectors/hive/HiveConnectorSplit.h" +#include "velox/connectors/hive/HiveDataSink.h" +#include "velox/connectors/hive/TableHandle.h" // HiveTableHandle +#include "velox/core/Expressions.h" +#include "velox/type/Filter.h" +#include "velox/type/Type.h" + +namespace facebook::velox::connector::hive { + +using namespace velox::common; +using namespace facebook::velox::connector::common; + +std::shared_ptr HiveObjectFactory::makeConnectorSplit( + const std::string& connectorId, + const std::string& filePath, + uint64_t start, + uint64_t length, + const folly::dynamic& options) const { + auto builder = HiveConnectorSplitBuilder(filePath) + .start(start) + .length(length) + .connectorId(connectorId); + + if (options.count("fileFormat")) { + builder.fileFormat( + static_cast(options["fileFormat"].asInt())); + } + + if (options.count("splitWeight")) { + builder.splitWeight(options["splitWeight"].asInt()); + } + + if (options.count("cacheable")) { + builder.cacheable(options["cacheable"].asBool()); + } + + if (options.count("infoColumns")) { + for (auto& kv : options["infoColumns"].items()) { + builder.infoColumn(kv.first.asString(), kv.second.asString()); + } + } + + if (options.count("partitionKeys")) { + for (auto& kv : options["partitionKeys"].items()) { + builder.partitionKey( + kv.first.asString(), + kv.second.isNull() + ? std::nullopt + : std::optional(kv.second.asString())); + } + } + + if (options.count("tableBucketNumber")) { + builder.tableBucketNumber(options["tableBucketNumber"].asInt()); + } + + if (options.count("bucketConversion")) { + HiveBucketConversion bucketConversion; + const auto& bucketConversionOption = options["bucketConversion"]; + bucketConversion.tableBucketCount = + bucketConversionOption["tableBucketCount"].asInt(); + bucketConversion.partitionBucketCount = + bucketConversionOption["partitionBucketCount"].asInt(); + for (auto& bucketColumnHandlesOption : + bucketConversionOption["bucketColumnHandles"]) { + bucketConversion.bucketColumnHandles.push_back( + std::const_pointer_cast( + ISerializable::deserialize( + bucketColumnHandlesOption))); + } + builder.bucketConversion(bucketConversion); + } + + if (options.count("customSplitInfo")) { + std::unordered_map info; + for (auto& kv : options["customSplitInfo"].items()) { + info[kv.first.asString()] = kv.second.asString(); + } + builder.customSplitInfo(info); + } + + if (options.count("extraFileInfo")) { + auto extra = options["extraFileInfo"].isNull() + ? 
std::shared_ptr()
+        : std::make_shared(options["extraFileInfo"].asString());
+    builder.extraFileInfo(extra);
+  }
+
+  if (options.count("serdeParameters")) {
+    std::unordered_map serde;
+    for (auto& kv : options["serdeParameters"].items()) {
+      serde[kv.first.asString()] = kv.second.asString();
+    }
+    builder.serdeParameters(serde);
+  }
+
+  if (options.count("storageParameters")) {
+    std::unordered_map storage;
+    for (auto& kv : options["storageParameters"].items()) {
+      storage[kv.first.asString()] = kv.second.asString();
+    }
+    builder.storageParameters(storage);
+  }
+
+  if (options.count("properties")) {
+    FileProperties props;
+    const auto& propertiesOption = options["properties"];
+    if (propertiesOption.count("fileSize") &&
+        !propertiesOption["fileSize"].isNull()) {
+      props.fileSize = propertiesOption["fileSize"].asInt();
+    }
+    if (propertiesOption.count("modificationTime") &&
+        !propertiesOption["modificationTime"].isNull()) {
+      props.modificationTime = propertiesOption["modificationTime"].asInt();
+    }
+    builder.fileProperties(props);
+  }
+
+  if (options.count("rowIdProperties")) {
+    RowIdProperties rowIdProperties;
+    const auto& rowIdPropertiesOption = options["rowIdProperties"];
+    rowIdProperties.metadataVersion =
+        rowIdPropertiesOption["metadataVersion"].asInt();
+    rowIdProperties.partitionId = rowIdPropertiesOption["partitionId"].asInt();
+    rowIdProperties.tableGuid = rowIdPropertiesOption["tableGuid"].asString();
+    builder.rowIdProperties(rowIdProperties);
+  }
+
+  return builder.build();
+}
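+
+// Illustrative sketch only (not part of the connector API): the options map
+// handled above mirrors HiveConnectorSplitBuilder. A caller might build a
+// split roughly like this; the connector id, path and sizes are made up.
+//
+//   folly::dynamic options = folly::dynamic::object
+//       ("fileFormat", static_cast<int>(dwio::common::FileFormat::PARQUET))
+//       ("splitWeight", 10)
+//       ("cacheable", true)
+//       ("partitionKeys", folly::dynamic::object("ds", "2024-01-01"));
+//   auto split = factory.makeConnectorSplit(
+//       "test-hive", "/tmp/data/part-0.parquet", /*start=*/0,
+//       /*length=*/1024, options);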
+
+std::shared_ptr HiveObjectFactory::makeTableHandle(
+    const std::string& connectorId,
+    const std::string& tableName,
+    const RowTypePtr& dataColumns,
+    const folly::dynamic& options) const {
+  bool pushdown =
+      options.getDefault("filterPushdownEnabled", true).asBool();
+  auto subfields = options.count("subfieldFilters")
+      ? SubfieldFilters::fromDynamic(options["subfieldFilters"])
+      : SubfieldFilters{};
+  auto remaining = options.count("remainingFilter")
+      ? deserializeTypedExpr(options["remainingFilter"])
+      : core::TypedExprPtr{};
+
+  std::unordered_map tableParams;
+  if (options.count("tableParameters")) {
+    for (auto& kv : options["tableParameters"].items()) {
+      tableParams[kv.first.asString()] = kv.second.asString();
+    }
+  }
+
+  return std::make_shared(
+      connectorId,
+      tableName,
+      pushdown,
+      std::move(subfields),
+      remaining,
+      dataColumns,
+      tableParams);
+}
+
+std::shared_ptr<connector::common::ConnectorInsertTableHandle>
+HiveObjectFactory::makeInsertTableHandle(
+    const std::string& connectorId,
+    const std::vector<std::string>& tableColumnNames,
+    const std::vector<TypePtr>& tableColumnTypes,
+    std::shared_ptr<connector::common::LocationHandle> locationHandle,
+    const std::optional<velox::common::CompressionKind> compressionKind,
+    const folly::dynamic& options) const {
+  // Unpack connector-specific options from the dynamic map. Missing keys fall
+  // back to the HiveInsertTableHandle defaults.
+  std::vector<std::string> partitionedBy;
+  if (options.count("partitionedBy")) {
+    for (auto& col : options["partitionedBy"]) {
+      partitionedBy.push_back(col.asString());
+    }
+  }
+
+  std::unordered_map<std::string, std::string> serdeParameters;
+  if (options.count("serdeParameters")) {
+    for (auto& kv : options["serdeParameters"].items()) {
+      serdeParameters[kv.first.asString()] = kv.second.asString();
+    }
+  }
+
+  const auto fileFormat = options.count("fileFormat")
+      ? static_cast<dwio::common::FileFormat>(options["fileFormat"].asInt())
+      : dwio::common::FileFormat::DWRF;
+  const bool ensureFiles = options.getDefault("ensureFiles", false).asBool();
+
+  // Build the input column handles, marking partition columns.
+  std::vector<std::shared_ptr<const HiveColumnHandle>> inputColumns;
+  inputColumns.reserve(tableColumnNames.size());
+  for (size_t i = 0; i < tableColumnNames.size(); ++i) {
+    bool isPartitionKey = false;
+    for (const auto& key : partitionedBy) {
+      if (key == tableColumnNames[i]) {
+        isPartitionKey = true;
+        break;
+      }
+    }
+    inputColumns.push_back(std::make_shared<HiveColumnHandle>(
+        tableColumnNames[i],
+        isPartitionKey ? HiveColumnHandle::ColumnType::kPartitionKey
+                       : HiveColumnHandle::ColumnType::kRegular,
+        tableColumnTypes[i],
+        tableColumnTypes[i]));
+  }
+
+  // Argument order follows the HiveInsertTableHandle constructor.
+  return std::make_shared<HiveInsertTableHandle>(
+      std::move(inputColumns),
+      std::move(locationHandle),
+      fileFormat,
+      /*bucketProperty=*/nullptr,
+      compressionKind,
+      serdeParameters,
+      /*writerOptions=*/nullptr,
+      ensureFiles);
+}
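+
+// Illustrative sketch only: a hypothetical options payload for
+// makeInsertTableHandle above. The keys match the ones unpacked in the
+// function; the table layout and connector id are invented for the example.
+//
+//   folly::dynamic options = folly::dynamic::object
+//       ("partitionedBy", folly::dynamic::array("ds"))
+//       ("fileFormat", static_cast<int>(dwio::common::FileFormat::DWRF))
+//       ("serdeParameters", folly::dynamic::object())
+//       ("ensureFiles", false);
+//   auto insertHandle = factory.makeInsertTableHandle(
+//       "test-hive", {"c0", "ds"}, {BIGINT(), VARCHAR()}, locationHandle,
+//       velox::common::CompressionKind::CompressionKind_ZSTD, options);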
+
+std::unique_ptr HiveObjectFactory::makeColumnHandle(
+    const std::string& connectorId,
+    const std::string& name,
+    const TypePtr& dataType,
+    const folly::dynamic& options) const {
+  using HiveColumnType = hive::HiveColumnHandle::ColumnType;
+  HiveColumnType hiveColumnType = HiveColumnType::kRegular;
+  if (options.count("columnType")) {
+    auto str = options.getDefault("columnType", "regular").asString();
+
+    if (str == "partition_key") {
+      hiveColumnType = HiveColumnType::kPartitionKey;
+    } else if (str == "synthesized") {
+      hiveColumnType = HiveColumnType::kSynthesized;
+    } else if (str == "row_index") {
+      hiveColumnType = HiveColumnType::kRowIndex;
+    } else if (str == "row_id") {
+      hiveColumnType = HiveColumnType::kRowId;
+    }
+  }
+
+  auto hiveType = velox::ISerializable::deserialize(options["hiveType"]);
+
+  std::vector subfields;
+  if (options.count("requiredSubfields")) {
+    for (auto& v : options["requiredSubfields"]) {
+      subfields.push_back(v.asString());
+    }
+  }
+
+  return std::make_unique(
+      name, hiveColumnType, dataType, hiveType, std::move(subfields));
+}
+
+std::shared_ptr HiveObjectFactory::makeLocationHandle(
+    const std::string& connectorId,
+    std::string targetDirectory,
+    std::optional writeDirectory,
+    LocationHandle::TableType tableType) const {
+  return std::make_shared(
+      std::move(targetDirectory),
+      writeDirectory.value_or(targetDirectory),
+      tableType);
+}
+
+} // namespace facebook::velox::connector::hive
diff --git a/velox/connectors/hive/HiveObjectFactory.h b/velox/connectors/hive/HiveObjectFactory.h
new file mode 100644
index 000000000000..e6b2fe7a7b49
--- /dev/null
+++ b/velox/connectors/hive/HiveObjectFactory.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/connectors/common/ConnectorObjectFactory.h" + +namespace facebook::velox::connector::hive { + +class HiveObjectFactory : public connector::common::ConnectorObjectFactory { + public: + ~HiveObjectFactory() override = default; + + std::shared_ptr makeConnectorSplit( + const std::string& connectorId, + const std::string& filePath, + uint64_t start, + uint64_t length, + const folly::dynamic& options = {}) const override; + + std::shared_ptr makeTableHandle( + const std::string& connectorId, + const std::string& tableName, + const RowTypePtr& dataColumns = nullptr, + const folly::dynamic& options = {}) const override; + + std::shared_ptr + makeInsertTableHandle( + const std::string& connectorId, + const std::vector& tableColumnNames, + const std::vector& tableColumnTypes, + std::shared_ptr locationHandle, + const std::optional compressionKind, + const folly::dynamic& options = {}) const override; + + std::unique_ptr makeColumnHandle( + const std::string& connectorId, + const std::string& name, + const TypePtr& type, + const folly::dynamic& options) const override; + + std::shared_ptr makeLocationHandle( + const std::string& connectorId, + std::string targetDirectory, + std::optional writeDirectory = std::nullopt, + connector::common::LocationHandle::TableType tableType = + connector::common::LocationHandle::TableType::kNew) const override; +}; + +} // namespace facebook::velox::connector::hive diff --git a/velox/connectors/hive/HivePlugin.cpp b/velox/connectors/hive/HivePlugin.cpp new file mode 100644 index 000000000000..275e2ba0f013 --- /dev/null +++ b/velox/connectors/hive/HivePlugin.cpp @@ -0,0 +1,37 @@ +/* +* Copyright (c) Facebook, Inc. and its affiliates. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/
+
+#include "velox/connectors/common/Connector.h" // for connectorObjectFactories()
+#include "velox/connectors/common/ConnectorNames.h" // for kHiveConnectorName
+#include "velox/connectors/hive/HiveConnector.h"
+#include "velox/connectors/hive/HiveObjectFactory.h"
+
+extern "C" void registerConnectorPlugin() {
+  using namespace facebook::velox::connector::common;
+  using namespace facebook::velox::connector::hive;
+
+  connectorFactories().emplace(
+      kHiveConnectorName, std::make_shared());
+
+  connectorObjectFactories().emplace(
+      kHiveConnectorName, std::make_unique());
+}
+
+// Force registration even if someone links this .so directly
+static bool _hivePluginRegistered = []() {
+  registerConnectorPlugin();
+  return true;
+}();
diff --git a/velox/connectors/hive/SplitReader.cpp b/velox/connectors/hive/SplitReader.cpp index 11cc8322dd5a..241771f0c031 100644 --- a/velox/connectors/hive/SplitReader.cpp +++ b/velox/connectors/hive/SplitReader.cpp @@ -80,14 +80,14 @@ std::unique_ptr SplitReader::create( const std::shared_ptr& hiveTableHandle, const std::unordered_map>* partitionKeys, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveConfig, const RowTypePtr& readerOutputType, const std::shared_ptr& ioStats, const std::shared_ptr& fsStats, FileHandleFactory* fileHandleFactory, folly::Executor* executor, - const std::shared_ptr& scanSpec) { + const std::shared_ptr& scanSpec) { // Create the SplitReader based on hiveSplit->customSplitInfo["table_format"] if (hiveSplit->customSplitInfo.count("table_format") > 0 && hiveSplit->customSplitInfo["table_format"] == "hive-iceberg") { @@ -124,14 +124,14 @@ SplitReader::SplitReader( const std::shared_ptr& hiveTableHandle, const std::unordered_map>* partitionKeys, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveConfig, const RowTypePtr& readerOutputType, const std::shared_ptr& ioStats, const std::shared_ptr& fsStats, FileHandleFactory* fileHandleFactory, folly::Executor* executor, - const std::shared_ptr& scanSpec) + const std::shared_ptr& scanSpec) : hiveSplit_(hiveSplit), hiveTableHandle_(hiveTableHandle), partitionKeys_(partitionKeys), @@ -161,7 +161,7 @@ void SplitReader::configureReaderOptions( } void SplitReader::prepareSplit( - std::shared_ptr metadataFilter, + std::shared_ptr metadataFilter, dwio::common::RuntimeStatistics& runtimeStats) { createReader(); if (emptySplit_) { @@ -202,11 +202,11 @@ void SplitReader::resetSplit() { int64_t SplitReader::estimatedRowSize() const { if (!baseRowReader_) { - return DataSource::kUnknownRowSize; + return connector::common::DataSource::kUnknownRowSize; } const auto size = baseRowReader_->estimatedRowSize(); - return size.value_or(DataSource::kUnknownRowSize); + return size.value_or(connector::common::DataSource::kUnknownRowSize); } void SplitReader::updateRuntimeStats( @@ -221,7 +221,7 @@ bool SplitReader::allPrefetchIssued() const { } void SplitReader::setConnectorQueryCtx( - const ConnectorQueryCtx* connectorQueryCtx) { + const connector::common::ConnectorQueryCtx* connectorQueryCtx) { connectorQueryCtx_ = connectorQueryCtx; } @@ -321,7 +321,7 @@ bool SplitReader::checkIfSplitIsEmpty( } void SplitReader::createRowReader( - std::shared_ptr metadataFilter, + std::shared_ptr metadataFilter, RowTypePtr rowType) { VELOX_CHECK_NULL(baseRowReader_); configureRowReaderOptions( @@ -366,7 +366,7 @@ std::vector SplitReader::adaptColumns(
connectorQueryCtx_->sessionProperties())); childSpec->setConstantValue(constant); } else if ( - childSpec->columnType() == common::ScanSpec::ColumnType::kRegular) { + childSpec->columnType() == velox::common::ScanSpec::ColumnType::kRegular) { auto fileTypeIdx = fileType->getChildIdxIfExists(fieldName); if (!fileTypeIdx.has_value()) { // Column is missing. Most likely due to schema evolution. @@ -407,7 +407,7 @@ std::vector SplitReader::adaptColumns( } void SplitReader::setPartitionValue( - common::ScanSpec* spec, + velox::common::ScanSpec* spec, const std::string& partitionKey, const std::optional& value) const { auto it = partitionKeys_->find(partitionKey); diff --git a/velox/connectors/hive/SplitReader.h b/velox/connectors/hive/SplitReader.h index 6987f94b40c0..a8a427181c0b 100644 --- a/velox/connectors/hive/SplitReader.h +++ b/velox/connectors/hive/SplitReader.h @@ -33,9 +33,9 @@ class MetadataFilter; class ScanSpec; } // namespace facebook::velox::common -namespace facebook::velox::connector { +namespace facebook::velox::connector::common { class ConnectorQueryCtx; -} // namespace facebook::velox::connector +} // namespace facebook::velox::connector::common namespace facebook::velox::dwio::common { struct RuntimeStatistics; @@ -59,14 +59,14 @@ class SplitReader { const std::shared_ptr& hiveTableHandle, const std::unordered_map>* partitionKeys, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveConfig, const RowTypePtr& readerOutputType, const std::shared_ptr& ioStats, const std::shared_ptr& fsStats, FileHandleFactory* fileHandleFactory, folly::Executor* executor, - const std::shared_ptr& scanSpec); + const std::shared_ptr& scanSpec); virtual ~SplitReader() = default; @@ -78,7 +78,7 @@ class SplitReader { /// files or log files, and add column adapatations for metadata columns. 
It /// would be called only once per incoming split virtual void prepareSplit( - std::shared_ptr metadataFilter, + std::shared_ptr metadataFilter, dwio::common::RuntimeStatistics& runtimeStats); virtual uint64_t next(uint64_t size, VectorPtr& output); @@ -95,7 +95,7 @@ class SplitReader { bool allPrefetchIssued() const; - void setConnectorQueryCtx(const ConnectorQueryCtx* connectorQueryCtx); + void setConnectorQueryCtx(const connector::common::ConnectorQueryCtx* connectorQueryCtx); const RowTypePtr& readerOutputType() const { return readerOutputType_; @@ -109,14 +109,14 @@ class SplitReader { const std::shared_ptr& hiveTableHandle, const std::unordered_map>* partitionKeys, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveConfig, const RowTypePtr& readerOutputType, const std::shared_ptr& ioStats, const std::shared_ptr& fsStats, FileHandleFactory* fileHandleFactory, folly::Executor* executor, - const std::shared_ptr& scanSpec); + const std::shared_ptr& scanSpec); /// Create the dwio::common::Reader object baseReader_, which will be used to /// read the data file's metadata and schema @@ -142,7 +142,7 @@ class SplitReader { /// Create the dwio::common::RowReader object baseRowReader_, which owns the /// ColumnReaders that will be used to read the data void createRowReader( - std::shared_ptr metadataFilter, + std::shared_ptr metadataFilter, RowTypePtr rowType); private: @@ -153,7 +153,7 @@ class SplitReader { const std::shared_ptr& tableSchema) const; void setPartitionValue( - common::ScanSpec* spec, + velox::common::ScanSpec* spec, const std::string& partitionKey, const std::optional& value) const; @@ -163,7 +163,7 @@ class SplitReader { const std::unordered_map< std::string, std::shared_ptr>* const partitionKeys_; - const ConnectorQueryCtx* connectorQueryCtx_; + const connector::common::ConnectorQueryCtx* connectorQueryCtx_; const std::shared_ptr hiveConfig_; RowTypePtr readerOutputType_; @@ -173,7 +173,7 @@ class SplitReader { folly::Executor* const executor_; memory::MemoryPool* const pool_; - std::shared_ptr scanSpec_; + std::shared_ptr scanSpec_; std::unique_ptr baseReader_; std::unique_ptr baseRowReader_; dwio::common::ReaderOptions baseReaderOpts_; diff --git a/velox/connectors/hive/TableHandle.cpp b/velox/connectors/hive/TableHandle.cpp index c8ba5625e8cf..2134281aced9 100644 --- a/velox/connectors/hive/TableHandle.cpp +++ b/velox/connectors/hive/TableHandle.cpp @@ -53,7 +53,7 @@ HiveColumnHandle::ColumnType HiveColumnHandle::columnTypeFromName( } folly::dynamic HiveColumnHandle::serialize() const { - folly::dynamic obj = ConnectorColumnHandle::serializeBase("HiveColumnHandle"); + folly::dynamic obj = connector::common::ConnectorColumnHandle::serializeBase("HiveColumnHandle"); obj["hiveColumnHandleName"] = name_; obj["columnType"] = columnTypeName(columnType_); obj["dataType"] = dataType_->serialize(); @@ -81,14 +81,14 @@ std::string HiveColumnHandle::toString() const { return out.str(); } -ConnectorColumnHandlePtr HiveColumnHandle::create(const folly::dynamic& obj) { +connector::common::ConnectorColumnHandlePtr HiveColumnHandle::create(const folly::dynamic& obj) { auto name = obj["hiveColumnHandleName"].asString(); auto columnType = columnTypeFromName(obj["columnType"].asString()); auto dataType = ISerializable::deserialize(obj["dataType"]); auto hiveType = ISerializable::deserialize(obj["hiveType"]); const auto& arr = obj["requiredSubfields"]; - std::vector requiredSubfields; + std::vector 
requiredSubfields; requiredSubfields.reserve(arr.size()); for (auto& s : arr) { requiredSubfields.emplace_back(s.asString()); @@ -107,11 +107,11 @@ HiveTableHandle::HiveTableHandle( std::string connectorId, const std::string& tableName, bool filterPushdownEnabled, - common::SubfieldFilters subfieldFilters, + velox::common::SubfieldFilters subfieldFilters, const core::TypedExprPtr& remainingFilter, const RowTypePtr& dataColumns, const std::unordered_map& tableParameters) - : ConnectorTableHandle(std::move(connectorId)), + : connector::common::ConnectorTableHandle(std::move(connectorId)), tableName_(tableName), filterPushdownEnabled_(filterPushdownEnabled), subfieldFilters_(std::move(subfieldFilters)), @@ -124,7 +124,7 @@ std::string HiveTableHandle::toString() const { out << "table: " << tableName_; if (!subfieldFilters_.empty()) { // Sort filters by subfield for deterministic output. - std::map orderedFilters; + std::map orderedFilters; for (const auto& [field, filter] : subfieldFilters_) { orderedFilters[field.toString()] = filter.get(); } @@ -163,7 +163,7 @@ std::string HiveTableHandle::toString() const { } folly::dynamic HiveTableHandle::serialize() const { - folly::dynamic obj = ConnectorTableHandle::serializeBase("HiveTableHandle"); + folly::dynamic obj = connector::common::ConnectorTableHandle::serializeBase("HiveTableHandle"); obj["tableName"] = tableName_; obj["filterPushdownEnabled"] = filterPushdownEnabled_; @@ -191,7 +191,7 @@ folly::dynamic HiveTableHandle::serialize() const { return obj; } -ConnectorTableHandlePtr HiveTableHandle::create( +connector::common::ConnectorTableHandlePtr HiveTableHandle::create( const folly::dynamic& obj, void* context) { auto connectorId = obj["connectorId"].asString(); @@ -204,13 +204,13 @@ ConnectorTableHandlePtr HiveTableHandle::create( ISerializable::deserialize(it->second, context); } - common::SubfieldFilters subfieldFilters; + velox::common::SubfieldFilters subfieldFilters; folly::dynamic subfieldFiltersObj = obj["subfieldFilters"]; for (const auto& subfieldFilter : subfieldFiltersObj) { - common::Subfield subfield(subfieldFilter["subfield"].asString()); + velox::common::Subfield subfield(subfieldFilter["subfield"].asString()); auto filter = - ISerializable::deserialize(subfieldFilter["filter"]); - subfieldFilters[common::Subfield(std::move(subfield.path()))] = + ISerializable::deserialize(subfieldFilter["filter"]); + subfieldFilters[velox::common::Subfield(std::move(subfield.path()))] = filter->clone(); } diff --git a/velox/connectors/hive/TableHandle.h b/velox/connectors/hive/TableHandle.h index 0a6446061866..4c59e9a7c2e1 100644 --- a/velox/connectors/hive/TableHandle.h +++ b/velox/connectors/hive/TableHandle.h @@ -15,7 +15,7 @@ */ #pragma once -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" #include "velox/core/ITypedExpr.h" #include "velox/type/Filter.h" #include "velox/type/Subfield.h" @@ -23,7 +23,7 @@ namespace facebook::velox::connector::hive { -class HiveColumnHandle : public ConnectorColumnHandle { +class HiveColumnHandle : public connector::common::ConnectorColumnHandle { public: enum class ColumnType { kPartitionKey, @@ -52,7 +52,7 @@ class HiveColumnHandle : public ConnectorColumnHandle { ColumnType columnType, TypePtr dataType, TypePtr hiveType, - std::vector requiredSubfields = {}, + std::vector requiredSubfields = {}, ColumnParseParameters columnParseParameters = {}) : name_(name), columnType_(columnType), @@ -97,7 +97,7 @@ class HiveColumnHandle : public ConnectorColumnHandle { /// 
/// Pruning arrays means dropping values with indices larger than maximum /// required index. - const std::vector& requiredSubfields() const { + const std::vector& requiredSubfields() const { return requiredSubfields_; } @@ -114,7 +114,7 @@ class HiveColumnHandle : public ConnectorColumnHandle { folly::dynamic serialize() const override; - static ConnectorColumnHandlePtr create(const folly::dynamic& obj); + static connector::common::ConnectorColumnHandlePtr create(const folly::dynamic& obj); static std::string columnTypeName(HiveColumnHandle::ColumnType columnType); @@ -128,17 +128,17 @@ class HiveColumnHandle : public ConnectorColumnHandle { const ColumnType columnType_; const TypePtr dataType_; const TypePtr hiveType_; - const std::vector requiredSubfields_; + const std::vector requiredSubfields_; const ColumnParseParameters columnParseParameters_; }; -class HiveTableHandle : public ConnectorTableHandle { +class HiveTableHandle : public connector::common::ConnectorTableHandle { public: HiveTableHandle( std::string connectorId, const std::string& tableName, bool filterPushdownEnabled, - common::SubfieldFilters subfieldFilters, + velox::common::SubfieldFilters subfieldFilters, const core::TypedExprPtr& remainingFilter, const RowTypePtr& dataColumns = nullptr, const std::unordered_map& tableParameters = {}); @@ -155,7 +155,7 @@ class HiveTableHandle : public ConnectorTableHandle { return filterPushdownEnabled_; } - const common::SubfieldFilters& subfieldFilters() const { + const velox::common::SubfieldFilters& subfieldFilters() const { return subfieldFilters_; } @@ -176,7 +176,7 @@ class HiveTableHandle : public ConnectorTableHandle { folly::dynamic serialize() const override; - static ConnectorTableHandlePtr create( + static connector::common::ConnectorTableHandlePtr create( const folly::dynamic& obj, void* context); @@ -185,7 +185,7 @@ class HiveTableHandle : public ConnectorTableHandle { private: const std::string tableName_; const bool filterPushdownEnabled_; - const common::SubfieldFilters subfieldFilters_; + const velox::common::SubfieldFilters subfieldFilters_; const core::TypedExprPtr remainingFilter_; const RowTypePtr dataColumns_; const std::unordered_map tableParameters_; diff --git a/velox/connectors/hive/iceberg/IcebergSplitReader.cpp b/velox/connectors/hive/iceberg/IcebergSplitReader.cpp index 84c85d3dce0b..5081f48bf2d3 100644 --- a/velox/connectors/hive/iceberg/IcebergSplitReader.cpp +++ b/velox/connectors/hive/iceberg/IcebergSplitReader.cpp @@ -29,14 +29,14 @@ IcebergSplitReader::IcebergSplitReader( const std::shared_ptr& hiveTableHandle, const std::unordered_map>* partitionKeys, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveConfig, const RowTypePtr& readerOutputType, const std::shared_ptr& ioStats, const std::shared_ptr& fsStats, FileHandleFactory* const fileHandleFactory, folly::Executor* executor, - const std::shared_ptr& scanSpec) + const std::shared_ptr& scanSpec) : SplitReader( hiveSplit, hiveTableHandle, @@ -54,7 +54,7 @@ IcebergSplitReader::IcebergSplitReader( deleteBitmap_(nullptr) {} void IcebergSplitReader::prepareSplit( - std::shared_ptr metadataFilter, + std::shared_ptr metadataFilter, dwio::common::RuntimeStatistics& runtimeStats) { createReader(); if (emptySplit_) { diff --git a/velox/connectors/hive/iceberg/IcebergSplitReader.h b/velox/connectors/hive/iceberg/IcebergSplitReader.h index 795912159b96..aa96dcb185de 100644 --- 
a/velox/connectors/hive/iceberg/IcebergSplitReader.h +++ b/velox/connectors/hive/iceberg/IcebergSplitReader.h @@ -16,7 +16,7 @@ #pragma once -#include "velox/connectors/Connector.h" +#include "../../common/Connector.h" #include "velox/connectors/hive/SplitReader.h" #include "velox/connectors/hive/iceberg/PositionalDeleteFileReader.h" @@ -31,19 +31,19 @@ class IcebergSplitReader : public SplitReader { const std::shared_ptr& hiveTableHandle, const std::unordered_map>* partitionKeys, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveConfig, const RowTypePtr& readerOutputType, const std::shared_ptr& ioStats, const std::shared_ptr& fsStats, FileHandleFactory* fileHandleFactory, folly::Executor* executor, - const std::shared_ptr& scanSpec); + const std::shared_ptr& scanSpec); ~IcebergSplitReader() override = default; void prepareSplit( - std::shared_ptr metadataFilter, + std::shared_ptr metadataFilter, dwio::common::RuntimeStatistics& runtimeStats) override; uint64_t next(uint64_t size, VectorPtr& output) override; diff --git a/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp b/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp index 8f0d7ad13750..d3e7edbd7abb 100644 --- a/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp +++ b/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp @@ -28,7 +28,7 @@ PositionalDeleteFileReader::PositionalDeleteFileReader( const IcebergDeleteFile& deleteFile, const std::string& baseFilePath, FileHandleFactory* fileHandleFactory, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, folly::Executor* executor, const std::shared_ptr& hiveConfig, const std::shared_ptr& ioStats, @@ -60,10 +60,10 @@ PositionalDeleteFileReader::PositionalDeleteFileReader( // this batch. If not, no need to proceed. 
// Create the ScanSpec for this delete file - auto scanSpec = std::make_shared(""); + auto scanSpec = std::make_shared(""); scanSpec->addField(posColumn_->name, 0); auto* pathSpec = scanSpec->getOrCreateChild(filePathColumn_->name); - pathSpec->setFilter(std::make_unique( + pathSpec->setFilter(std::make_unique( std::vector({baseFilePath_}), false)); // Create the file schema (in RowType) and split that will be used by readers diff --git a/velox/connectors/hive/iceberg/PositionalDeleteFileReader.h b/velox/connectors/hive/iceberg/PositionalDeleteFileReader.h index 211359d7fb93..185b9b61f06c 100644 --- a/velox/connectors/hive/iceberg/PositionalDeleteFileReader.h +++ b/velox/connectors/hive/iceberg/PositionalDeleteFileReader.h @@ -19,7 +19,7 @@ #include #include -#include "velox/connectors/Connector.h" +#include "../../common/Connector.h" #include "velox/connectors/hive/FileHandle.h" #include "velox/connectors/hive/HiveConfig.h" #include "velox/connectors/hive/HiveConnectorSplit.h" @@ -36,7 +36,7 @@ class PositionalDeleteFileReader { const IcebergDeleteFile& deleteFile, const std::string& baseFilePath, FileHandleFactory* fileHandleFactory, - const ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, folly::Executor* executor, const std::shared_ptr& hiveConfig, const std::shared_ptr& ioStats, @@ -65,7 +65,7 @@ class PositionalDeleteFileReader { const std::string& baseFilePath_; FileHandleFactory* const fileHandleFactory_; folly::Executor* const executor_; - const ConnectorQueryCtx* connectorQueryCtx_; + const connector::common::ConnectorQueryCtx* connectorQueryCtx_; const std::shared_ptr hiveConfig_; const std::shared_ptr ioStats_; const std::shared_ptr fsStats_; diff --git a/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp b/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp index a3c64f55748f..bd7b57f41e14 100644 --- a/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp +++ b/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp @@ -187,7 +187,7 @@ class HiveIcebergTest : public HiveConnectorTestBase { deleteFilePaths = writePositionDeleteFiles( deleteFilesForBaseDatafiles, dataFilePaths); - std::vector> splits; + std::vector> splits; for (const auto& dataFile : dataFilePaths) { std::string baseFileName = dataFile.first; @@ -240,7 +240,7 @@ class HiveIcebergTest : public HiveConnectorTestBase { std::shared_ptr config_; std::function()> flushPolicyFactory_; - std::vector> makeIcebergSplits( + std::vector> makeIcebergSplits( const std::string& dataFilePath, const std::vector& deleteFiles = {}, const std::unordered_map>& @@ -252,7 +252,7 @@ class HiveIcebergTest : public HiveConnectorTestBase { auto file = filesystems::getFileSystem(dataFilePath, nullptr) ->openFileForRead(dataFilePath); const int64_t fileSize = file->size(); - std::vector> splits; + std::vector> splits; const uint64_t splitSize = std::floor((fileSize) / splitCount); for (int i = 0; i < splitCount; ++i) { @@ -701,7 +701,7 @@ TEST_F(HiveIcebergTest, testPartitionedRead) { // Iceberg API sets partition values for dates to daysSinceEpoch, so // in velox, we do not need to convert it to days. 
// Test query on two partitions ds=17627(2018-04-06), ds=17628(2018-04-07) - std::vector> splits; + std::vector> splits; std::vector> dataFilePaths; for (int i = 0; i <= 1; ++i) { std::vector dataVectors; @@ -723,7 +723,7 @@ TEST_F(HiveIcebergTest, testPartitionedRead) { std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; assignments.insert( {"c0", @@ -733,7 +733,7 @@ TEST_F(HiveIcebergTest, testPartitionedRead) { rowType->childAt(0), rowType->childAt(0))}); - std::vector requiredSubFields; + std::vector requiredSubFields; HiveColumnHandle::ColumnParseParameters columnParseParameters; columnParseParameters.partitionDateValueFormat = HiveColumnHandle::ColumnParseParameters::kDaysSinceEpoch; diff --git a/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmark.cpp b/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmark.cpp index bc9d2b4ad266..adbd4ec3e9fb 100644 --- a/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmark.cpp +++ b/velox/connectors/hive/iceberg/tests/IcebergSplitReaderBenchmark.cpp @@ -308,13 +308,13 @@ void IcebergSplitReaderBenchmark::readSingleColumn( std::make_shared( std::unordered_map()); - std::unique_ptr connectorQueryCtx_ = - std::make_unique( + std::unique_ptr connectorQueryCtx_ = + std::make_unique( opPool.get(), connectorPool.get(), connectorSessionProperties_.get(), nullptr, - common::PrefixSortConfig(), + velox::common::PrefixSortConfig(), nullptr, nullptr, "query.IcebergSplitReader", diff --git a/velox/connectors/hive/storage_adapters/abfs/AbfsFileSystem.cpp b/velox/connectors/hive/storage_adapters/abfs/AbfsFileSystem.cpp index 18fbd3f2284a..ee699abaab26 100644 --- a/velox/connectors/hive/storage_adapters/abfs/AbfsFileSystem.cpp +++ b/velox/connectors/hive/storage_adapters/abfs/AbfsFileSystem.cpp @@ -97,7 +97,7 @@ class AbfsReadFile::Impl { } uint64_t preadv( - folly::Range regions, + folly::Range regions, folly::Range iobufs, File::IoStats* stats) const { size_t length = 0; @@ -186,7 +186,7 @@ uint64_t AbfsReadFile::preadv( } uint64_t AbfsReadFile::preadv( - folly::Range regions, + folly::Range regions, folly::Range iobufs, File::IoStats* stats) const { return impl_->preadv(regions, iobufs, stats); diff --git a/velox/connectors/hive/storage_adapters/abfs/AbfsReadFile.h b/velox/connectors/hive/storage_adapters/abfs/AbfsReadFile.h index 942439c06c1e..361ff40deb05 100644 --- a/velox/connectors/hive/storage_adapters/abfs/AbfsReadFile.h +++ b/velox/connectors/hive/storage_adapters/abfs/AbfsReadFile.h @@ -48,7 +48,7 @@ class AbfsReadFile final : public ReadFile { File::IoStats* stats = nullptr) const final; uint64_t preadv( - folly::Range regions, + folly::Range regions, folly::Range iobufs, File::IoStats* stats = nullptr) const final; diff --git a/velox/connectors/hive/storage_adapters/gcs/tests/GcsInsertTest.cpp b/velox/connectors/hive/storage_adapters/gcs/tests/GcsInsertTest.cpp index 3771dc95364f..fb324865c131 100644 --- a/velox/connectors/hive/storage_adapters/gcs/tests/GcsInsertTest.cpp +++ b/velox/connectors/hive/storage_adapters/gcs/tests/GcsInsertTest.cpp @@ -34,18 +34,18 @@ class GcsInsertTest : public testing::Test, public test::InsertTest { } void SetUp() override { - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); emulator_ = std::make_shared(); emulator_->bootstrap(); auto hiveConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( 
exec::test::kHiveConnectorId, emulator_->hiveConfig(), ioExecutor_.get()); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); parquet::registerParquetReaderFactory(); parquet::registerParquetWriterFactory(); ioExecutor_ = std::make_unique(3); @@ -54,9 +54,9 @@ class GcsInsertTest : public testing::Test, public test::InsertTest { void TearDown() override { parquet::unregisterParquetReaderFactory(); parquet::unregisterParquetWriterFactory(); - connector::unregisterConnectorFactory( + connector::common::unregisterConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName); - connector::unregisterConnector(exec::test::kHiveConnectorId); + connector::common::unregisterConnector(exec::test::kHiveConnectorId); } std::shared_ptr emulator_; diff --git a/velox/connectors/hive/storage_adapters/s3fs/tests/S3InsertTest.cpp b/velox/connectors/hive/storage_adapters/s3fs/tests/S3InsertTest.cpp index 553e840c8094..1f748ff32a0f 100644 --- a/velox/connectors/hive/storage_adapters/s3fs/tests/S3InsertTest.cpp +++ b/velox/connectors/hive/storage_adapters/s3fs/tests/S3InsertTest.cpp @@ -35,16 +35,16 @@ class S3InsertTest : public S3Test, public test::InsertTest { void SetUp() override { S3Test::SetUp(); filesystems::registerS3FileSystem(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); auto hiveConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( ::exec::test::kHiveConnectorId, minioServer_->hiveConfig(), ioExecutor_.get()); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); parquet::registerParquetReaderFactory(); parquet::registerParquetWriterFactory(); } @@ -52,9 +52,9 @@ class S3InsertTest : public S3Test, public test::InsertTest { void TearDown() override { parquet::unregisterParquetReaderFactory(); parquet::unregisterParquetWriterFactory(); - connector::unregisterConnectorFactory( + connector::common::unregisterConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName); - connector::unregisterConnector(::exec::test::kHiveConnectorId); + connector::common::unregisterConnector(::exec::test::kHiveConnectorId); S3Test::TearDown(); filesystems::finalizeS3FileSystem(); } diff --git a/velox/connectors/hive/storage_adapters/s3fs/tests/S3MultipleEndpointsTest.cpp b/velox/connectors/hive/storage_adapters/s3fs/tests/S3MultipleEndpointsTest.cpp index cb6e8e783473..9f8ef8e5de55 100644 --- a/velox/connectors/hive/storage_adapters/s3fs/tests/S3MultipleEndpointsTest.cpp +++ b/velox/connectors/hive/storage_adapters/s3fs/tests/S3MultipleEndpointsTest.cpp @@ -52,7 +52,7 @@ class S3MultipleEndpoints : public S3Test, public ::test::VectorTestBase { minioSecondServer_->addBucket(kBucketName.data()); filesystems::registerS3FileSystem(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); parquet::registerParquetReaderFactory(); parquet::registerParquetWriterFactory(); @@ -64,27 +64,27 @@ class S3MultipleEndpoints : public S3Test, public ::test::VectorTestBase { const std::unordered_map config1Override = {}, const std::unordered_map config2Override = {}) { auto hiveConnector1 = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( std::string(connectorId1), 
minioServer_->hiveConfig(config1Override), ioExecutor_.get()); auto hiveConnector2 = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( std::string(connectorId2), minioSecondServer_->hiveConfig(config2Override), ioExecutor_.get()); - connector::registerConnector(hiveConnector1); - connector::registerConnector(hiveConnector2); + connector::common::registerConnector(hiveConnector1); + connector::common::registerConnector(hiveConnector2); } void TearDown() override { parquet::unregisterParquetReaderFactory(); parquet::unregisterParquetWriterFactory(); - connector::unregisterConnectorFactory( + connector::common::unregisterConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName); S3Test::TearDown(); } @@ -198,8 +198,8 @@ TEST_F(S3MultipleEndpoints, baseEndpoints) { testJoin(kExpectedRows, outputDirectory, kConnectorId1, kConnectorId2); - connector::unregisterConnector(std::string(kConnectorId1)); - connector::unregisterConnector(std::string(kConnectorId2)); + connector::common::unregisterConnector(std::string(kConnectorId1)); + connector::common::unregisterConnector(std::string(kConnectorId2)); } TEST_F(S3MultipleEndpoints, bucketEndpoints) { @@ -225,8 +225,8 @@ TEST_F(S3MultipleEndpoints, bucketEndpoints) { testJoin(kExpectedRows, outputDirectory, kConnectorId1, kConnectorId2); - connector::unregisterConnector(std::string(kConnectorId1)); - connector::unregisterConnector(std::string(kConnectorId2)); + connector::common::unregisterConnector(std::string(kConnectorId1)); + connector::common::unregisterConnector(std::string(kConnectorId2)); } } // namespace facebook::velox diff --git a/velox/connectors/hive/storage_adapters/s3fs/tests/S3ReadTest.cpp b/velox/connectors/hive/storage_adapters/s3fs/tests/S3ReadTest.cpp index 7abcbfb7e56b..954a6e8f1477 100644 --- a/velox/connectors/hive/storage_adapters/s3fs/tests/S3ReadTest.cpp +++ b/velox/connectors/hive/storage_adapters/s3fs/tests/S3ReadTest.cpp @@ -39,22 +39,22 @@ class S3ReadTest : public S3Test, public ::test::VectorTestBase { void SetUp() override { S3Test::SetUp(); filesystems::registerS3FileSystem(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); auto hiveConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector(kHiveConnectorId, minioServer_->hiveConfig()); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); parquet::registerParquetReaderFactory(); } void TearDown() override { parquet::unregisterParquetReaderFactory(); filesystems::finalizeS3FileSystem(); - connector::unregisterConnectorFactory( + connector::common::unregisterConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName); - connector::unregisterConnector(kHiveConnectorId); + connector::common::unregisterConnector(kHiveConnectorId); S3Test::TearDown(); } }; diff --git a/velox/connectors/hive/tests/HiveConnectorSerDeTest.cpp b/velox/connectors/hive/tests/HiveConnectorSerDeTest.cpp index 5481c554d38d..01b398309757 100644 --- a/velox/connectors/hive/tests/HiveConnectorSerDeTest.cpp +++ b/velox/connectors/hive/tests/HiveConnectorSerDeTest.cpp @@ -15,7 +15,7 @@ */ #include -#include "velox/connectors/Connector.h" +#include "../../common/Connector.h" #include "velox/connectors/hive/HiveConnector.h" #include 
"velox/exec/tests/utils/HiveConnectorTestBase.h" #include "velox/expression/ExprToSubfieldFilter.h" @@ -29,7 +29,7 @@ class HiveConnectorSerDeTest : public exec::test::HiveConnectorTestBase { protected: HiveConnectorSerDeTest() { Type::registerSerDe(); - common::Filter::registerSerDe(); + velox::common::Filter::registerSerDe(); core::ITypedExpr::registerSerDe(); HiveTableHandle::registerSerDe(); HiveColumnHandle::registerSerDe(); @@ -123,7 +123,7 @@ TEST_F(HiveConnectorSerDeTest, hiveTableHandle) { ROW({"c0c0", "c1", "c2", "c3", "c4", "c5"}, {INTEGER(), BIGINT(), DOUBLE(), BOOLEAN(), BIGINT(), VARCHAR()}); auto tableHandle = makeTableHandle( - common::test::SubfieldFiltersBuilder() + velox::common::test::SubfieldFiltersBuilder() .add("c0.c0", isNotNull()) .add( "c1", @@ -226,7 +226,7 @@ TEST_F(HiveConnectorSerDeTest, hiveInsertTableHandle) { bucketProperty, locationHandle, dwio::common::FileFormat::NIMBLE, - common::CompressionKind::CompressionKind_SNAPPY, + velox::common::CompressionKind::CompressionKind_SNAPPY, serdeParameters); testSerde(*hiveInsertTableHandle); } diff --git a/velox/connectors/hive/tests/HiveConnectorTest.cpp b/velox/connectors/hive/tests/HiveConnectorTest.cpp index 4834cf3b8335..98254998fd75 100644 --- a/velox/connectors/hive/tests/HiveConnectorTest.cpp +++ b/velox/connectors/hive/tests/HiveConnectorTest.cpp @@ -51,12 +51,12 @@ std::vector makeSubfields(const std::vector& paths) { return subfields; } -folly::F14FastMap> +folly::F14FastMap> groupSubfields(const std::vector& subfields) { - folly::F14FastMap> grouped; + folly::F14FastMap> grouped; for (auto& subfield : subfields) { auto& name = - static_cast(*subfield.path()[0]) + static_cast(*subfield.path()[0]) .name(); grouped[name].push_back(&subfield); } diff --git a/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp b/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp index 82d39c68b495..376694f67918 100644 --- a/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp +++ b/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp @@ -47,12 +47,12 @@ class HiveConnectorUtilTest : public exec::test::HiveConnectorTestBase { TEST_F(HiveConnectorUtilTest, configureReaderOptions) { config::ConfigBase sessionProperties({}); - auto connectorQueryCtx = std::make_unique( + auto connectorQueryCtx = std::make_unique( pool_.get(), pool_.get(), &sessionProperties, nullptr, - common::PrefixSortConfig(), + velox::common::PrefixSortConfig(), nullptr, nullptr, "query.HiveConnectorUtilTest", @@ -79,7 +79,7 @@ TEST_F(HiveConnectorUtilTest, configureReaderOptions) { "testConnectorId", "testTable", false, - common::SubfieldFilters{}, + velox::common::SubfieldFilters{}, nullptr, nullptr, tableParameters); @@ -285,12 +285,12 @@ TEST_F(HiveConnectorUtilTest, cacheRetention) { std::make_shared(std::make_shared( std::unordered_map())); - auto connectorQueryCtx = std::make_unique( + auto connectorQueryCtx = std::make_unique( pool_.get(), pool_.get(), &sessionProperties, nullptr, - common::PrefixSortConfig(), + velox::common::PrefixSortConfig(), nullptr, nullptr, "query.HiveConnectorUtilTest", @@ -305,7 +305,7 @@ TEST_F(HiveConnectorUtilTest, cacheRetention) { "testConnectorId", "testTable", false, - common::SubfieldFilters{}, + velox::common::SubfieldFilters{}, nullptr, nullptr, std::unordered_map{}); @@ -341,11 +341,11 @@ TEST_F(HiveConnectorUtilTest, configureRowReaderOptions) { auto split = std::make_shared("", "", FileFormat::UNKNOWN); auto rowType = ROW({{"float_features", MAP(INTEGER(), REAL())}}); - auto spec = std::make_shared(""); + 
auto spec = std::make_shared(""); spec->addAllChildFields(*rowType); auto* float_features = spec->childByName("float_features"); - float_features->childByName(common::ScanSpec::kMapKeysFieldName) - ->setFilter(common::createBigintValues({1, 3}, false)); + float_features->childByName(velox::common::ScanSpec::kMapKeysFieldName) + ->setFilter(velox::common::createBigintValues({1, 3}, false)); float_features->setFlatMapFeatureSelection({"1", "3"}); } diff --git a/velox/connectors/hive/tests/HiveDataSinkTest.cpp b/velox/connectors/hive/tests/HiveDataSinkTest.cpp index 537db9e064cb..cc68fd660b02 100644 --- a/velox/connectors/hive/tests/HiveDataSinkTest.cpp +++ b/velox/connectors/hive/tests/HiveDataSinkTest.cpp @@ -126,12 +126,12 @@ class HiveDataSinkTest : public exec::test::HiveConnectorTestBase { connectorPool_ = root_->addAggregateChild("connector", exec::MemoryReclaimer::create()); - connectorQueryCtx_ = std::make_unique( + connectorQueryCtx_ = std::make_unique( opPool_.get(), connectorPool_.get(), connectorSessionProperties_.get(), nullptr, - common::PrefixSortConfig(), + velox::common::PrefixSortConfig(), nullptr, nullptr, "query.HiveDataSinkTest", @@ -156,16 +156,10 @@ class HiveDataSinkTest : public exec::test::HiveConnectorTestBase { outputRowType->names(), outputRowType->children(), partitionedBy, - bucketProperty, makeLocationHandle( outputDirectoryPath, std::nullopt, - connector::hive::LocationHandle::TableType::kNew), - fileFormat, - CompressionKind::CompressionKind_ZSTD, - {}, - writerOptions, - ensureFiles); + connector::common::LocationHandle::TableType::kNew),; } std::shared_ptr createDataSink( @@ -189,7 +183,7 @@ class HiveDataSinkTest : public exec::test::HiveConnectorTestBase { writerOptions, ensureFiles), connectorQueryCtx_.get(), - CommitStrategy::kNoCommit, + connector::common::CommitStrategy::kNoCommit, connectorConfig_); } @@ -206,7 +200,7 @@ class HiveDataSinkTest : public exec::test::HiveConnectorTestBase { void verifyWrittenData(const std::string& dirPath, int32_t numFiles = 1) { const std::vector filePaths = listFiles(dirPath); ASSERT_EQ(filePaths.size(), numFiles); - std::vector> splits; + std::vector> splits; std::for_each(filePaths.begin(), filePaths.end(), [&](auto filePath) { splits.push_back(makeHiveConnectorSplit(filePath)); }); @@ -217,7 +211,7 @@ class HiveDataSinkTest : public exec::test::HiveConnectorTestBase { } void setConnectorQueryContext( - std::unique_ptr connectorQueryCtx) { + std::unique_ptr connectorQueryCtx) { connectorQueryCtx_ = std::move(connectorQueryCtx); } @@ -232,7 +226,7 @@ class HiveDataSinkTest : public exec::test::HiveConnectorTestBase { std::make_shared( std::unordered_map(), /*mutable=*/true); - std::unique_ptr connectorQueryCtx_; + std::unique_ptr connectorQueryCtx_; std::shared_ptr connectorConfig_ = std::make_shared(std::make_shared( std::unordered_map())); @@ -740,12 +734,12 @@ DEBUG_ONLY_TEST_F(HiveDataSinkTest, memoryReclaim) { spillDirectory = exec::test::TempDirectoryPath::create(); spillConfig = getSpillConfig( spillDirectory->getPath(), testData.writerFlushThreshold); - auto connectorQueryCtx = std::make_unique( + auto connectorQueryCtx = std::make_unique( opPool_.get(), connectorPool_.get(), connectorSessionProperties_.get(), spillConfig.get(), - common::PrefixSortConfig(), + velox::common::PrefixSortConfig(), nullptr, nullptr, "query.HiveDataSinkTest", @@ -755,12 +749,12 @@ DEBUG_ONLY_TEST_F(HiveDataSinkTest, memoryReclaim) { ""); setConnectorQueryContext(std::move(connectorQueryCtx)); } else { - auto connectorQueryCtx = 
std::make_unique( + auto connectorQueryCtx = std::make_unique( opPool_.get(), connectorPool_.get(), connectorSessionProperties_.get(), nullptr, - common::PrefixSortConfig(), + velox::common::PrefixSortConfig(), nullptr, nullptr, "query.HiveDataSinkTest", @@ -881,12 +875,12 @@ TEST_F(HiveDataSinkTest, memoryReclaimAfterClose) { if (testData.writerSpillEnabled) { spillDirectory = exec::test::TempDirectoryPath::create(); spillConfig = getSpillConfig(spillDirectory->getPath(), 0); - auto connectorQueryCtx = std::make_unique( + auto connectorQueryCtx = std::make_unique( opPool_.get(), connectorPool_.get(), connectorSessionProperties_.get(), spillConfig.get(), - common::PrefixSortConfig(), + velox::common::PrefixSortConfig(), nullptr, nullptr, "query.HiveDataSinkTest", @@ -896,12 +890,12 @@ TEST_F(HiveDataSinkTest, memoryReclaimAfterClose) { ""); setConnectorQueryContext(std::move(connectorQueryCtx)); } else { - auto connectorQueryCtx = std::make_unique( + auto connectorQueryCtx = std::make_unique( opPool_.get(), connectorPool_.get(), connectorSessionProperties_.get(), nullptr, - common::PrefixSortConfig(), + velox::common::PrefixSortConfig(), nullptr, nullptr, "query.HiveDataSinkTest", @@ -1022,12 +1016,12 @@ TEST_F(HiveDataSinkTest, sortWriterMemoryReclaimDuringFinish) { HiveConfig::kSortWriterFinishTimeSliceLimitMsSession, "1"); connectorSessionProperties_->set( HiveConfig::kSortWriterMaxOutputRowsSession, "100"); - auto connectorQueryCtx = std::make_unique( + auto connectorQueryCtx = std::make_unique( opPool_.get(), connectorPool_.get(), connectorSessionProperties_.get(), spillConfig.get(), - common::PrefixSortConfig(), + velox::common::PrefixSortConfig(), nullptr, nullptr, "query.HiveDataSinkTest", @@ -1087,12 +1081,12 @@ DEBUG_ONLY_TEST_F(HiveDataSinkTest, sortWriterFailureTest) { getSpillConfig(spillDirectory->getPath(), 0); // Triggers the memory reservation in sort buffer. 
spillConfig->minSpillableReservationPct = 1'000; - auto connectorQueryCtx = std::make_unique( + auto connectorQueryCtx = std::make_unique( opPool_.get(), connectorPool_.get(), connectorSessionProperties_.get(), spillConfig.get(), - common::PrefixSortConfig(), + velox::common::PrefixSortConfig(), nullptr, nullptr, "query.HiveDataSinkTest", @@ -1277,41 +1271,22 @@ TEST_F(HiveDataSinkTest, ensureFilesUnsupported) { makeHiveInsertTableHandle( rowType_->names(), rowType_->children(), - {rowType_->names()[0]}, // partitionedBy - nullptr, // bucketProperty - makeLocationHandle( + {rowType_->names()[0]}, // bucketProperty +makeLocationHandle( "/path/to/test", std::nullopt, - connector::hive::LocationHandle::TableType::kNew), - dwio::common::FileFormat::DWRF, - CompressionKind::CompressionKind_ZSTD, - {}, // serdeParameters - nullptr, // writeOptions - true // ensureFiles - ), + connector::common::LocationHandle::TableType::kNew),, "ensureFiles is not supported with partition keys in the data"); VELOX_ASSERT_THROW( makeHiveInsertTableHandle( rowType_->names(), rowType_->children(), - {}, // partitionedBy - {std::make_shared( - HiveBucketProperty::Kind::kPrestoNative, - 1, - std::vector{rowType_->names()[0]}, - std::vector{rowType_->children()[0]}, - std::vector>{})}, + {}, makeLocationHandle( "/path/to/test", std::nullopt, - connector::hive::LocationHandle::TableType::kNew), - dwio::common::FileFormat::DWRF, - CompressionKind::CompressionKind_ZSTD, - {}, // serdeParameters - nullptr, // writeOptions - true // ensureFiles - ), + connector::common::LocationHandle::TableType::kNew),, "ensureFiles is not supported with bucketing"); } } // namespace diff --git a/velox/connectors/iceberg/IcebergObjectFactory.cpp b/velox/connectors/iceberg/IcebergObjectFactory.cpp new file mode 100644 index 000000000000..049ef2b61b73 --- /dev/null +++ b/velox/connectors/iceberg/IcebergObjectFactory.cpp @@ -0,0 +1,62 @@ +// +// Created by Ying Su on 6/20/25. 
+//
+
+#include "velox/connectors/iceberg/IcebergObjectFactory.h"
+
+#include "velox/connectors/iceberg/IcebergInsertTableHandle.h"
+#include "velox/connectors/iceberg/IcebergLocationHandle.h"
+#include "velox/common/base/Exceptions.h"
+#include "velox/dwio/common/WriterOptions.h"
+
+namespace facebook::velox::connector::iceberg {
+
+using facebook::velox::connector::common::ConnectorLocationHandle;
+using facebook::velox::connector::common::LocationHandlePtr;
+
+std::shared_ptr
+IcebergObjectFactory::makeInsertTableHandle(
+    const std::string& connectorId,
+    std::vector> inputColumns,
+    std::shared_ptr locationHandle,
+    const folly::dynamic& options) const {
+  // 1) Cast locationHandle to IcebergLocationHandle
+  auto icebergLoc = std::dynamic_pointer_cast(
+      locationHandle);
+  VELOX_CHECK(
+      icebergLoc,
+      "Expected IcebergLocationHandle in IcebergObjectFactory::makeInsertTableHandle");
+
+  // 2) Catalog, namespace, table
+  auto catalog = options["catalog"].asString();
+  std::vector ns;
+  for (auto& v : options["namespace"]) {
+    ns.push_back(v.asString());
+  }
+  auto tableName = options["tableName"].asString();
+
+  // 3) Snapshot & PartitionSpec IDs
+  int64_t snapshotId = options.getDefault("snapshotId", 0).asInt();
+  int32_t specId = options.getDefault("partitionSpecId", 0).asInt();
+
+  // 4) WriterOptions (optional)
+  std::shared_ptr writerOptions = nullptr;
+  if (auto p = options.get_ptr("writerOptions")) {
+    writerOptions =
+        dwio::common::WriterOptions::fromDynamic(*p);
+  }
+
+  // 5) Construct and return
+  return std::make_shared(
+      std::move(inputColumns),
+      std::move(icebergLoc),
+      connectorId, // pass along if Iceberg needs it
+      std::move(catalog),
+      std::move(ns),
+      std::move(tableName),
+      snapshotId,
+      specId,
+      std::move(writerOptions));
+}
+
+} // namespace facebook::velox::connector::iceberg
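+
+// Illustrative sketch only: a hypothetical options payload for
+// makeInsertTableHandle above. The catalog, namespace and table names are
+// invented for the example.
+//
+//   folly::dynamic options = folly::dynamic::object
+//       ("catalog", "demo_catalog")
+//       ("namespace", folly::dynamic::array("db"))
+//       ("tableName", "events")
+//       ("snapshotId", 1)
+//       ("partitionSpecId", 0);
+//   auto handle = factory.makeInsertTableHandle(
+//       "test-iceberg", std::move(inputColumns), locationHandle, options);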
diff --git a/velox/connectors/iceberg/IcebergObjectFactory.h b/velox/connectors/iceberg/IcebergObjectFactory.h
new file mode 100644
index 000000000000..0c1c082d5c67
--- /dev/null
+++ b/velox/connectors/iceberg/IcebergObjectFactory.h
@@ -0,0 +1,16 @@
+//
+// Created by Ying Su on 6/20/25.
+//
+
+#pragma once
+
+#include "velox/connectors/common/ConnectorObjectFactory.h"
+
+namespace facebook::velox::connector::iceberg {
+
+class IcebergObjectFactory : public common::ConnectorObjectFactory {
+ public:
+  std::shared_ptr
+  makeInsertTableHandle(
+      const std::string& connectorId,
+      std::vector> inputColumns,
+      std::shared_ptr locationHandle,
+      const folly::dynamic& options) const;
+};
+
+} // namespace facebook::velox::connector::iceberg
diff --git a/velox/connectors/iceberg/IcebergTableHandle.cpp b/velox/connectors/iceberg/IcebergTableHandle.cpp
new file mode 100644
index 000000000000..095a37201904
--- /dev/null
+++ b/velox/connectors/iceberg/IcebergTableHandle.cpp
@@ -0,0 +1,5 @@
+//
+// Created by Ying Su on 6/20/25.
+//
+
+#include "IcebergTableHandle.h"
diff --git a/velox/connectors/iceberg/IcebergTableHandle.h b/velox/connectors/iceberg/IcebergTableHandle.h
new file mode 100644
index 000000000000..80666b5991b7
--- /dev/null
+++ b/velox/connectors/iceberg/IcebergTableHandle.h
@@ -0,0 +1,10 @@
+//
+// Created by Ying Su on 6/20/25.
+//
+
+#ifndef VELOX_ICEBERGTABLEHANDLE_H
+#define VELOX_ICEBERGTABLEHANDLE_H
+
+class IcebergTableHandle {};
+
+#endif // VELOX_ICEBERGTABLEHANDLE_H
diff --git a/velox/connectors/tests/ConnectorTest.cpp b/velox/connectors/tests/ConnectorTest.cpp index cb40cb56219b..c472167defce 100644 --- a/velox/connectors/tests/ConnectorTest.cpp +++ b/velox/connectors/tests/ConnectorTest.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" #include "velox/common/base/tests/GTestUtils.h" #include "velox/common/config/Config.h" @@ -22,42 +22,42 @@ namespace facebook::velox::connector { class ConnectorTest : public testing::Test {}; namespace { -class TestConnector : public connector::Connector { +class TestConnector : public connector::common::Connector { public: - TestConnector(const std::string& id) : connector::Connector(id) {} + TestConnector(const std::string& id) : connector::common::Connector(id) {} - std::unique_ptr createDataSource( + std::unique_ptr createDataSource( const RowTypePtr& /* outputType */, - const std::shared_ptr& /* tableHandle */, + const std::shared_ptr& /* tableHandle */, const std::unordered_map< std::string, std::shared_ptr< - connector::ConnectorColumnHandle>>& /* columnHandles */, - connector::ConnectorQueryCtx* connectorQueryCtx) override { + connector::common::ConnectorColumnHandle>>& /* columnHandles */, + connector::common::ConnectorQueryCtx* connectorQueryCtx) override { VELOX_NYI(); } - std::unique_ptr createDataSink( + std::unique_ptr createDataSink( RowTypePtr /*inputType*/, std::shared_ptr< - ConnectorInsertTableHandle> /*connectorInsertTableHandle*/, - ConnectorQueryCtx* /*connectorQueryCtx*/, + connector::common::ConnectorInsertTableHandle> /*connectorInsertTableHandle*/, + connector::common::ConnectorQueryCtx* /*connectorQueryCtx*/, CommitStrategy /*commitStrategy*/) override final { VELOX_NYI(); } }; -class TestConnectorFactory : public connector::ConnectorFactory { +class TestConnectorFactory : public connector::common::ConnectorFactory { public: static constexpr const char* kConnectorFactoryName = "test-factory"; - TestConnectorFactory() : ConnectorFactory(kConnectorFactoryName) {} + TestConnectorFactory() : connector::common::ConnectorFactory(kConnectorFactoryName) {} - std::shared_ptr newConnector( + std::shared_ptr newConnector( const std::string& id, std::shared_ptr /*config*/, folly::Executor* /*ioExecutor*/ = nullptr, @@ -100,7 +100,7 @@ TEST_F(ConnectorTest, getAllConnectors) { TEST_F(ConnectorTest, connectorSplit) { { - const ConnectorSplit split("test", 100, true); + const connector::common::ConnectorSplit split("test", 100, true); ASSERT_EQ(split.connectorId, "test"); ASSERT_EQ(split.splitWeight, 100); ASSERT_EQ(split.cacheable, true); ASSERT_EQ( split.toString(), "[split: connector id test, weight 100, cacheable true]"); } { - const ConnectorSplit split("test", 50, false); + const connector::common::ConnectorSplit split("test", 50, false); ASSERT_EQ(split.connectorId, "test"); ASSERT_EQ(split.splitWeight, 50); ASSERT_EQ(split.cacheable, false); diff --git a/velox/connectors/tpch/TpchConnector.cpp b/velox/connectors/tpch/TpchConnector.cpp index 3b4dcf7a8323..e430db9b993a 100644 --- a/velox/connectors/tpch/TpchConnector.cpp +++ b/velox/connectors/tpch/TpchConnector.cpp @@ -59,10 +59,10 @@ std::string TpchTableHandle::toString() const { TpchDataSource::TpchDataSource( const
std::shared_ptr& outputType, - const std::shared_ptr& tableHandle, + const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, velox::memory::MemoryPool* pool) : pool_(pool) { auto tpchTableHandle = @@ -121,7 +121,7 @@ RowVectorPtr TpchDataSource::projectOutputColumns(RowVectorPtr inputVector) { std::move(children)); } -void TpchDataSource::addSplit(std::shared_ptr split) { +void TpchDataSource::addSplit(std::shared_ptr split) { VELOX_CHECK_EQ( currentSplit_, nullptr, diff --git a/velox/connectors/tpch/TpchConnector.h b/velox/connectors/tpch/TpchConnector.h index 992a96ef3500..0a41784a5e69 100644 --- a/velox/connectors/tpch/TpchConnector.h +++ b/velox/connectors/tpch/TpchConnector.h @@ -16,7 +16,7 @@ #pragma once #include "velox/common/config/Config.h" -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" #include "velox/connectors/tpch/TpchConnectorSplit.h" #include "velox/tpch/gen/TpchGen.h" @@ -26,7 +26,7 @@ class TpchConnector; // TPC-H column handle only needs the column name (all columns are generated in // the same way). -class TpchColumnHandle : public ConnectorColumnHandle { +class TpchColumnHandle : public connector::common::ConnectorColumnHandle { public: explicit TpchColumnHandle(const std::string& name) : name_(name) {} @@ -39,13 +39,13 @@ class TpchColumnHandle : public ConnectorColumnHandle { }; // TPC-H table handle uses the underlying enum to describe the target table. -class TpchTableHandle : public ConnectorTableHandle { +class TpchTableHandle : public connector::common::ConnectorTableHandle { public: explicit TpchTableHandle( std::string connectorId, velox::tpch::Table table, double scaleFactor = 1.0) - : ConnectorTableHandle(std::move(connectorId)), + : connector::common::ConnectorTableHandle(std::move(connectorId)), table_(table), scaleFactor_(scaleFactor) { VELOX_CHECK_GE(scaleFactor, 0, "Tpch scale factor must be non-negative"); @@ -68,21 +68,21 @@ class TpchTableHandle : public ConnectorTableHandle { double scaleFactor_; }; -class TpchDataSource : public DataSource { +class TpchDataSource : public connector::common::DataSource { public: TpchDataSource( const std::shared_ptr& outputType, - const std::shared_ptr& tableHandle, + const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, + std::shared_ptr>& columnHandles, velox::memory::MemoryPool* pool); - void addSplit(std::shared_ptr split) override; + void addSplit(std::shared_ptr split) override; void addDynamicFilter( column_index_t /*outputChannel*/, - const std::shared_ptr& /*filter*/) override { + const std::shared_ptr& /*filter*/) override { VELOX_NYI("Dynamic filters not supported by TpchConnector."); } @@ -129,21 +129,21 @@ class TpchDataSource : public DataSource { memory::MemoryPool* pool_; }; -class TpchConnector final : public Connector { +class TpchConnector final : public connector::common::Connector { public: TpchConnector( const std::string& id, std::shared_ptr config, folly::Executor* /*executor*/) - : Connector(id) {} + : connector::common::Connector(id) {} - std::unique_ptr createDataSource( + std::unique_ptr createDataSource( const std::shared_ptr& outputType, - const std::shared_ptr& tableHandle, + const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, - ConnectorQueryCtx* connectorQueryCtx) override final { + std::shared_ptr>& columnHandles, + 
connector::common::ConnectorQueryCtx* connectorQueryCtx) override final { return std::make_unique( outputType, tableHandle, @@ -151,26 +151,26 @@ class TpchConnector final : public Connector { connectorQueryCtx->memoryPool()); } - std::unique_ptr createDataSink( + std::unique_ptr createDataSink( RowTypePtr /*inputType*/, std::shared_ptr< - ConnectorInsertTableHandle> /*connectorInsertTableHandle*/, - ConnectorQueryCtx* /*connectorQueryCtx*/, - CommitStrategy /*commitStrategy*/) override final { + connector::common::ConnectorInsertTableHandle> /*connectorInsertTableHandle*/, + connector::common::ConnectorQueryCtx* /*connectorQueryCtx*/, + connector::common::CommitStrategy /*commitStrategy*/) override final { VELOX_NYI("TpchConnector does not support data sink."); } }; -class TpchConnectorFactory : public ConnectorFactory { +class TpchConnectorFactory : public connector::common::ConnectorFactory { public: static constexpr const char* kTpchConnectorName{"tpch"}; - TpchConnectorFactory() : ConnectorFactory(kTpchConnectorName) {} + TpchConnectorFactory() : connector::common::ConnectorFactory(kTpchConnectorName) {} explicit TpchConnectorFactory(const char* connectorName) - : ConnectorFactory(connectorName) {} + : connector::common::ConnectorFactory(connectorName) {} - std::shared_ptr newConnector( + std::shared_ptr newConnector( const std::string& id, std::shared_ptr config, folly::Executor* ioExecutor = nullptr, diff --git a/velox/connectors/tpch/TpchConnectorObjectFactory.cpp b/velox/connectors/tpch/TpchConnectorObjectFactory.cpp new file mode 100644 index 000000000000..368377686b11 --- /dev/null +++ b/velox/connectors/tpch/TpchConnectorObjectFactory.cpp @@ -0,0 +1,341 @@ + +#include "velox/connectors/hive/TpchObjehiveColumnTypeFahiveColumnTypeory.h" + +#include "velox/connectors/common/ConnehiveColumnTypeorNames.h" +#include "velox/connectors/common/ConnehiveColumnTypeorObjehiveColumnTypeFahiveColumnTypeory.h" +#include "velox/connectors/hive/TpchTpchConnectorSplitBuilder.h" +#include "velox/connectors/hive/LocationHandle.h" +#include "velox/connectors/hive/TableHandle.h" // TpchTableHandle +#include "velox/core/Expressions.h" +#include "velox/type/SubfieldFilters.h" + +namespace facebook::velox::connector::tpch { + +class TpchObjectFactory + : public connector::common::ConnectorObjectFactory { + public: + std::shared_ptr + TpchObjectFactory::makeTpchConnectorSplit( + const std::string& filePath, + uint64_t start, + uint64_t length, + const folly::dynamic& options = {}) const override { + return exec::Split(std::make_shared( + kTpchConnectorId, /*cacheable=*/true, totalParts, partNumber)); + + auto builder = + TpchConnectorSplitBuilder(filePath).start(start).length( + length); + + if (options.count("fileFormat")) { + builder.fileFormat( + static_cast(options["fileFormat"].asInt())); + } + + if (options.count("splitWeight")) { + builder.splitWeight(options["splitWeight"].asInt()); + } + + if (options.count("cacheable")) { + builder.cacheable(options["cacheable"].asBool()); + } + + if (options.count("infoColumns")) { + for (auto& kv : options["infoColumns"].items()) { + builder.infoColumn(kv.first.asString(), kv.second.asString()); + } + } + + if (options.count("partitionKeys")) { + for (auto& kv : options["partitionKeys"].items()) { + builder.partitionKey( + kv.first.asString(), + kv.second.isNull() + ? 
std::nullopt + : std::optional(kv.second.asString())); + } + } + + if (options.count("tableBucketNumber")) { + builder.tableBucketNumber(options["tableBucketNumber"].asInt()); + } + + if (options.count("bucketConversion")) { + TpchBucketConversion bucketConversion; + const auto& bucketConversionOption = options["bucketConversion"]; + bucketConversion.tableBucketCount = + bucketConversionOption["tableBucketCount"].asInt(); + bucketConversion.partitionBucketCount = + bucketConversionOption["partitionBucketCount"].asInt(); + for (auto& bucketColumnHandlesOption : + bucketConversionOption["bucketColumnHandles"]) { + bucketConversion.bucketColumnHandles.push_back( + std::const_pointer_cast( + deserialize(bucketColumnHandlesOption))); + } + builder.bucketConversion(bucketConversion); + } + + if (options.count("customSplitInfo")) { + std::unordered_map info; + for (auto& kv : options["customSplitInfo"].items()) { + info[kv.first.asString()] = kv.second.asString(); + } + builder.customSplitInfo(info); + } + + if (options.count("extraFileInfo")) { + auto extra = options["extraFileInfo"].isNull() + ? std::shared_ptr() + : std::make_shared(options["extraFileInfo"].asString()); + builder.extraFileInfo(extra); + } + + if (options.count("serdeParameters")) { + std::unordered_map serde; + for (auto& kv : options["serdeParameters"].items()) { + serde[kv.first.asString()] = kv.second.asString(); + } + builder.serdeParameters(serde); + } + + if (options.count("storageParameters")) { + std::unordered_map storage; + for (auto& kv : options["storageParameters"].items()) { + storage[kv.first.asString()] = kv.second.asString(); + } + builder.storageParameters(storage); + } + + if (options.count("properties")) { + FileProperties props; + const auto& propertiesOption = options["properties"]; + if (propertiesOption.count("fileSize") && + !propertiesOption["fileSize"].isNull()) { + props.fileSize = propertiesOption["fileSize"].asInt(); + } + if (propertiesOption.count("modificationTime") && + !propertiesOption["modificationTime"].isNull()) { + props.modificationTime = propertiesOption["modificationTime"].asInt(); + } + builder.fileProperties(props); + } + + if (options.count("rowIdProperties")) { + RowIdProperties rowIdProperties; + const auto& rowIdPropertiesOption = options["rowIdProperties"]; + rowIdProperties.metadataVersion = + rowIdPropertiesOption["metadataVersion"].asInt(); + rowIdProperties.partitionId = + rowIdPropertiesOption["partitionId"].asInt(); + rowIdProperties.tableGuid = rowIdPropertiesOption["tableGuid"].asString(); + builder.rowIdProperties(rowIdProperties); + } + + return builder.build(); + } + + std::shared_ptr + TpchObjehiveColumnTypeFahiveColumnTypeory::makeTableHandle( + const std::string& tableName, + const RowTypePtr& dataColumns = nullptr, + const folly::dynamic& options = {}) const override { + bool pushdown = + options.getDefault("filterowIdPropertiesushdownEnabled", true).asBool(); + auto subfields = options.count("subfieldFilters") + ? SubfieldFilters::fromDynamic(options["subfieldFilters"]) + : SubfieldFilters{}; + auto remaining = options.count("remainingFilter") + ? 
deserializeTypedExpr(options["remainingFilter"]) + : core::TypedExprowIdPropertiestr{}; + + std::unordered_map tableParams; + if (options.count("tableParameters")) { + for (auto& kv : options["tableParameters"].items()) { + tableParams[kv.first.asString()] = kv.second.asString(); + } + } + + return std::make_shared( + kTpchConnehiveColumnTypeorName, + tableName, + pushdown, + std::move(subfields), + remaining, + dataColumns, + tableParams); + } + + std::shared_ptr + TpchObjehiveColumnTypeFahiveColumnTypeory::makeInsertTableHandle( + const std::vehiveColumnTypeor& colNames, + const std::vehiveColumnTypeor& colTypes, + std::shared_ptr locHandle, + const std::optional codec, + const folly::dynamic& options = {}) const override { + // Pack connector-specific options into a dynamic map + folly::dynamic options = folly::dynamic::object( + "partitionedBy", folly::dynamic::array())( + "serdeParameters", folly::dynamic::object())( + "fileFormat", static_cast(tableStorageFormat))( + "ensureFiles", ensureFiles); + + // Add partition columns + for (const auto& col : partitionedBy) { + options["partitionedBy"].push_back(col); + } + + // Add serde parameters + for (auto& kv : serdeParameters) { + options["serdeParameters"][kv.first] = kv.second; + } + + // If writerOptions is non-null, pass it through + if (writerOptions) { + options["writerOptions"] = writerOptions; + } + + // Delegate to the common fahiveColumnTypeory + return fahiveColumnTypeory_->makeInsertTableHandle( + tableColumnNames, + tableColumnTypes, + std::move(locationHandle), + compressionKind, + options); + } + b.start(start).length(length); + if (options.count("splitWeight")) { + b.splitWeight(options["splitWeight"].asInt()); + } + if (options.count("cacheable")) { + b.cacheable(options["cacheable"].asBool()); + } + if (options.count("infoColumns")) { + for (auto& kv : options["infoColumns"].items()) { + b.infoColumn(kv.first.asString(), kv.second.asString()); + } + } + if (options.count("partitionKeys")) { + for (auto& kv : options["partitionKeys"].items()) { + b.partitionKey( + kv.first.asString(), + kv.second.isNull() + ? std::nullopt + : std::optional(kv.second.asString())); + } + } + if (options.count("tableBucketNumber")) { + b.tableBucketNumber(options["tableBucketNumber"].asInt()); + } + if (options.count("bucketConversion")) { + const auto& bcDyn = options["bucketConversion"]; + TpchBucketConversion bc; + bc.tableBucketCount = bcDyn["tableBucketCount"].asInt(); + bc.partitionBucketCount = bcDyn["partitionBucketCount"].asInt(); + for (auto& hDyn : bcDyn["bucketColumnHandles"]) { + bc.bucketColumnHandles.push_back( + std::const_pointer_cast( + facebook::velox::ISerializable::deserialize( + hDyn))); + } + b.bucketConversion(bc); + } + if (options.count("customSplitInfo")) { + std::unordered_map info; + for (auto& kv : options["customSplitInfo"].items()) { + info[kv.first.asString()] = kv.second.asString(); + } + b.customSplitInfo(info); + } + if (options.count("extraFileInfo")) { + auto extra = options["extraFileInfo"].isNull() + ? 
std::shared_ptr() + : std::make_shared(options["extraFileInfo"].asString()); + b.extraFileInfo(extra); + } + if (options.count("serdeParameters")) { + std::unordered_map serde; + for (auto& kv : options["serdeParameters"].items()) { + serde[kv.first.asString()] = kv.second.asString(); + } + b.serdeParameters(serde); + } + if (options.count("storageParameters")) { + std::unordered_map storage; + for (auto& kv : options["storageParameters"].items()) { + storage[kv.first.asString()] = kv.second.asString(); + } + b.storageParameters(storage); + } + if (options.count("properties")) { + FileProperties props; + const auto& pDyn = options["properties"]; + if (pDyn.count("fileSize") && !pDyn["fileSize"].isNull()) { + props.fileSize = pDyn["fileSize"].asInt(); + } + if (pDyn.count("modificationTime") && !pDyn["modificationTime"].isNull()) { + props.modificationTime = pDyn["modificationTime"].asInt(); + } + b.fileProperties(props); + } + if (options.count("rowIdProperties")) { + RowIdProperties rp; + const auto& rDyn = options["rowIdProperties"]; + rp.metadataVersion = rDyn["metadataVersion"].asInt(); + rp.partitionId = rDyn["partitionId"].asInt(); + rp.tableGuid = rDyn["tableGuid"].asString(); + b.rowIdProperties(rp); + } + return b.build(); +} +} + +std::unique_ptr +TpchObjehiveColumnTypeFahiveColumnTypeory::makeColumnHandle( + const std::string& name, + const TypePtr& dataType, + const folly::dynamic& options) const override { + using TpchColumnType = hive::TpchColumnHandle::ColumnType; + TpchColumnType hiveColumnType = TpchColumnType::kRegular; + if (options.count("columnType")) { + auto str = options.getDefault("columnType", "regular").asString(); + + if (str == "partition_key") { + hiveColumnType = TpchColumnType::kPartitionKey; + } else if (str == "synthesized") { + hiveColumnType = TpchColumnType::kSynthesized; + } else if (str == "row_index") { + hiveColumnType = TpchColumnType::kRowIndex; + } else if (str == "row_id") { + hiveColumnType = TpchColumnType::kRowId; + } + } + + TypePtr hiveType = options["hiveType"] + + std::vector subfields; + if (options.count("requiredSubfields")) { + for (auto& v : options["requiredSubfields"]) { + subfields.push_back(v.asString()); + } + } + + return std::make_unique( + name, columnType, dataType, hiveType, std::move(subfields)); +} + +std::shared_ptr +TpchObjehiveColumnTypeFahiveColumnTypeory::makeLocationHandle( + std::string targetDirectory, + std::optional writeDirectory = std::nullopt, + LocationHandle::TableType tableType = + LocationHandle::TableType::kNew) const override { + return std::make_shared( + std::move(targetDirectory), + writeDirectory.value_or(targetDirectory), + tableType); +} +} +; + +} // namespace facebook::velox::connector::hive diff --git a/velox/connectors/tpch/TpchConnectorSplit.h b/velox/connectors/tpch/TpchConnectorSplit.h index bfb112db5619..f476ad5bfd8a 100644 --- a/velox/connectors/tpch/TpchConnectorSplit.h +++ b/velox/connectors/tpch/TpchConnectorSplit.h @@ -16,11 +16,11 @@ #pragma once #include -#include "velox/connectors/Connector.h" +#include "../common/Connector.h" namespace facebook::velox::connector::tpch { -struct TpchConnectorSplit : public connector::ConnectorSplit { +struct TpchConnectorSplit : public connector::common::ConnectorSplit { explicit TpchConnectorSplit( const std::string& connectorId, size_t totalParts, @@ -32,7 +32,7 @@ struct TpchConnectorSplit : public connector::ConnectorSplit { bool cacheable, size_t totalParts, size_t partNumber) - : ConnectorSplit(connectorId, /*splitWeight=*/0, cacheable), + : 
connector::common::ConnectorSplit(connectorId, /*splitWeight=*/0, cacheable),
         totalParts(totalParts),
         partNumber(partNumber) {
     VELOX_CHECK_GE(totalParts, 1, "totalParts must be >= 1");
diff --git a/velox/connectors/tpch/TpchPlugin.cpp b/velox/connectors/tpch/TpchPlugin.cpp
new file mode 100644
index 000000000000..8e01833e9051
--- /dev/null
+++ b/velox/connectors/tpch/TpchPlugin.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/connectors/common/Connector.h" // for connectorObjectFactories()
+#include "velox/connectors/common/ConnectorNames.h" // for kTpchConnectorName
+#include "velox/connectors/tpch/TpchConnector.h"
+
+extern "C" void registerConnectorPlugin() {
+  using namespace facebook::velox::connector::common;
+  using namespace facebook::velox::connector::tpch;
+
+  connectorFactories().emplace(
+      kTpchConnectorName, std::make_shared<TpchConnectorFactory>());
+
+  connectorObjectFactories().emplace(
+      kTpchConnectorName, std::make_unique<TpchObjectFactory>());
+}
+
+// Force registration even if someone links this .so directly.
+static bool _tpchPluginRegistered = []() {
+  registerConnectorPlugin();
+  return true;
+}();
diff --git a/velox/connectors/tpch/tests/SpeedTest.cpp b/velox/connectors/tpch/tests/SpeedTest.cpp
index 0e8216260a90..10dfc62de6b1 100644
--- a/velox/connectors/tpch/tests/SpeedTest.cpp
+++ b/velox/connectors/tpch/tests/SpeedTest.cpp
@@ -27,7 +27,7 @@
 namespace {
 /// This a utility binary that helps measure and evaluate how fast we can
-/// generate TPC-H datasets using the TPC-H Connector. You can control the
+/// generate TPC-H datasets using the TPC-H connector. You can control the
 /// generated table, scale factor, number of splits, and number of threads
 /// (drivers) using the flags defined below.
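Note on TpchPlugin.cpp above: the plugin registers itself both through the extern "C" entry point and through a static initializer when linked directly. The sketch below is a minimal illustration of the dynamic-loading path, not part of the patch; the library name libvelox_tpch_plugin.so is an assumption, and only registerConnectorPlugin() and kTpchConnectorName come from the patch.

//=== Example (illustrative only): loading the TPC-H connector plugin ===
// Assumes the plugin above is built as a shared library named
// "libvelox_tpch_plugin.so" (hypothetical name).
#include <dlfcn.h>
#include <iostream>

int main() {
  // Load the plugin library at runtime.
  void* handle = dlopen("libvelox_tpch_plugin.so", RTLD_NOW);
  if (handle == nullptr) {
    std::cerr << "dlopen failed: " << dlerror() << std::endl;
    return 1;
  }
  // Resolve and invoke the plugin's extern "C" registration entry point.
  auto* registerFn =
      reinterpret_cast<void (*)()>(dlsym(handle, "registerConnectorPlugin"));
  if (registerFn == nullptr) {
    std::cerr << "dlsym failed: " << dlerror() << std::endl;
    return 1;
  }
  registerFn();
  // The TPC-H connector factory and object factory are now registered and can
  // be looked up by kTpchConnectorName.
  return 0;
}
//=== End example ===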
@@ -56,21 +56,21 @@ using std::chrono::system_clock; class TpchSpeedTest { public: TpchSpeedTest() { - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); auto tpchConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::tpch::TpchConnectorFactory::kTpchConnectorName) ->newConnector( kTpchConnectorId_, std::make_shared( std::unordered_map())); - connector::registerConnector(tpchConnector); + connector::common::registerConnector(tpchConnector); } ~TpchSpeedTest() { - connector::unregisterConnector(kTpchConnectorId_); - connector::unregisterConnectorFactory( + connector::common::unregisterConnector(kTpchConnectorId_); + connector::common::unregisterConnectorFactory( connector::tpch::TpchConnectorFactory::kTpchConnectorName); } diff --git a/velox/connectors/tpch/tests/TpchConnectorTest.cpp b/velox/connectors/tpch/tests/TpchConnectorTest.cpp index 65d0a1e09bce..d1e28249fedd 100644 --- a/velox/connectors/tpch/tests/TpchConnectorTest.cpp +++ b/velox/connectors/tpch/tests/TpchConnectorTest.cpp @@ -39,21 +39,21 @@ class TpchConnectorTest : public exec::test::OperatorTestBase { void SetUp() override { FLAGS_velox_tpch_text_pool_size_mb = 10; OperatorTestBase::SetUp(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); auto tpchConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::tpch::TpchConnectorFactory::kTpchConnectorName) ->newConnector( kTpchConnectorId, std::make_shared( std::unordered_map())); - connector::registerConnector(tpchConnector); + connector::common::registerConnector(tpchConnector); } void TearDown() override { - connector::unregisterConnector(kTpchConnectorId); - connector::unregisterConnectorFactory( + connector::common::unregisterConnector(kTpchConnectorId); + connector::common::unregisterConnectorFactory( connector::tpch::TpchConnectorFactory::kTpchConnectorName); OperatorTestBase::TearDown(); } diff --git a/velox/core/ExpressionEvaluator.h b/velox/core/ExpressionEvaluator.h index b10b113c8b25..36e165cdef39 100644 --- a/velox/core/ExpressionEvaluator.h +++ b/velox/core/ExpressionEvaluator.h @@ -26,7 +26,7 @@ namespace facebook::velox::core { class ITypedExpr; // Exposes expression evaluation functionality of the engine to other parts of -// the code base. Connector may use it, for example, to evaluate pushed down +// the code base. connector::common::Connector may use it, for example, to evaluate pushed down // filters. This is not thread safe and serializing operations is the // responsibility of the caller. This is self-contained and does not reference // objects from the thread which constructs this. 
Passing this between threads diff --git a/velox/core/PlanNode.cpp b/velox/core/PlanNode.cpp index 4cea1590e704..29f7be33c9b9 100644 --- a/velox/core/PlanNode.cpp +++ b/velox/core/PlanNode.cpp @@ -1097,21 +1097,21 @@ folly::dynamic TableScanNode::serialize() const { PlanNodePtr TableScanNode::create(const folly::dynamic& obj, void* context) { auto planNodeId = obj["id"].asString(); auto outputType = deserializeRowType(obj["outputType"]); - auto tableHandle = std::const_pointer_cast( - ISerializable::deserialize( + auto tableHandle = std::const_pointer_cast( + ISerializable::deserialize( obj["tableHandle"], context)); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; for (const auto& pair : obj["assignments"]) { auto assign = pair["assign"].asString(); auto columnHandle = - ISerializable::deserialize( + ISerializable::deserialize( pair["columnHandle"]); assignments[assign] = - std::const_pointer_cast(columnHandle); + std::const_pointer_cast(columnHandle); } return std::make_shared( @@ -2365,7 +2365,7 @@ folly::dynamic TableWriteNode::serialize() const { insertTableHandle_->connectorInsertTableHandle()->serialize(); obj["hasPartitioningScheme"] = hasPartitioningScheme_; obj["outputType"] = outputType_->serialize(); - obj["commitStrategy"] = connector::commitStrategyToString(commitStrategy_); + obj["commitStrategy"] = connector::common::commitStrategyToString(commitStrategy_); return obj; } @@ -2389,13 +2389,13 @@ PlanNodePtr TableWriteNode::create(const folly::dynamic& obj, void* context) { } auto connectorId = obj["connectorId"].asString(); auto connectorInsertTableHandle = - std::const_pointer_cast( - ISerializable::deserialize( + std::const_pointer_cast( + ISerializable::deserialize( obj["connectorInsertTableHandle"])); const bool hasPartitioningScheme = obj["hasPartitioningScheme"].asBool(); auto outputType = deserializeRowType(obj["outputType"]); auto commitStrategy = - connector::stringToCommitStrategy(obj["commitStrategy"].asString()); + connector::common::stringToCommitStrategy(obj["commitStrategy"].asString()); return std::make_shared( id, columns, diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h index 35ea5e31dfd1..99a5502d1065 100644 --- a/velox/core/PlanNode.h +++ b/velox/core/PlanNode.h @@ -19,7 +19,7 @@ #include -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" #include "velox/core/Expressions.h" #include "velox/core/QueryConfig.h" #include "velox/vector/VectorStream.h" @@ -38,7 +38,7 @@ struct InsertTableHandle { public: InsertTableHandle( const std::string& connectorId, - const std::shared_ptr& + const std::shared_ptr& connectorInsertTableHandle) : connectorId_(connectorId), connectorInsertTableHandle_(connectorInsertTableHandle) {} @@ -47,17 +47,17 @@ struct InsertTableHandle { return connectorId_; } - const std::shared_ptr& + const std::shared_ptr& connectorInsertTableHandle() const { return connectorInsertTableHandle_; } private: - // Connector ID + // connector::common::Connector ID const std::string connectorId_; // Write request to a DataSink of that connector type - const std::shared_ptr + const std::shared_ptr connectorInsertTableHandle_; }; @@ -877,10 +877,10 @@ class TableScanNode : public PlanNode { TableScanNode( const PlanNodeId& id, RowTypePtr outputType, - const std::shared_ptr& tableHandle, + const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& assignments) + std::shared_ptr>& assignments) : PlanNode(id), 
outputType_(std::move(outputType)), tableHandle_(tableHandle), @@ -908,7 +908,7 @@ class TableScanNode : public PlanNode { } Builder& tableHandle( - std::shared_ptr tableHandle) { + std::shared_ptr tableHandle) { tableHandle_ = std::move(tableHandle); return *this; } @@ -916,7 +916,7 @@ class TableScanNode : public PlanNode { Builder& assignments( std::unordered_map< std::string, - std::shared_ptr> assignments) { + std::shared_ptr> assignments) { assignments_ = std::move(assignments); return *this; } @@ -940,11 +940,11 @@ class TableScanNode : public PlanNode { private: std::optional id_; std::optional outputType_; - std::optional> + std::optional> tableHandle_; std::optional>> + std::shared_ptr>> assignments_; }; @@ -965,13 +965,13 @@ class TableScanNode : public PlanNode { return true; } - const std::shared_ptr& tableHandle() const { + const std::shared_ptr& tableHandle() const { return tableHandle_; } const std::unordered_map< std::string, - std::shared_ptr>& + std::shared_ptr>& assignments() const { return assignments_; } @@ -988,10 +988,10 @@ class TableScanNode : public PlanNode { void addDetails(std::stringstream& stream) const override; const RowTypePtr outputType_; - const std::shared_ptr tableHandle_; + const std::shared_ptr tableHandle_; const std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments_; }; @@ -1324,7 +1324,7 @@ class TableWriteNode : public PlanNode { std::shared_ptr insertTableHandle, bool hasPartitioningScheme, RowTypePtr outputType, - connector::CommitStrategy commitStrategy, + connector::common::CommitStrategy commitStrategy, const PlanNodePtr& source) : PlanNode(id), sources_{source}, @@ -1398,7 +1398,7 @@ class TableWriteNode : public PlanNode { return *this; } - Builder& commitStrategy(connector::CommitStrategy commitStrategy) { + Builder& commitStrategy(connector::common::CommitStrategy commitStrategy) { commitStrategy_ = commitStrategy; return *this; } @@ -1451,7 +1451,7 @@ class TableWriteNode : public PlanNode { std::optional> insertTableHandle_; std::optional hasPartitioningScheme_; std::optional outputType_; - std::optional commitStrategy_; + std::optional commitStrategy_; std::optional source_; }; @@ -1491,7 +1491,7 @@ class TableWriteNode : public PlanNode { return hasPartitioningScheme_; } - connector::CommitStrategy commitStrategy() const { + connector::common::CommitStrategy commitStrategy() const { return commitStrategy_; } @@ -1522,7 +1522,7 @@ class TableWriteNode : public PlanNode { const std::shared_ptr insertTableHandle_; const bool hasPartitioningScheme_; const RowTypePtr outputType_; - const connector::CommitStrategy commitStrategy_; + const connector::common::CommitStrategy commitStrategy_; }; class TableWriteMergeNode : public PlanNode { @@ -3215,7 +3215,7 @@ using BetweenIndexLookupConditionPtr = /// lookup table. Each join condition must use columns from both sides. For the /// right side, it can only use one index column. Each index column can either /// be a join key or a join condition once. The table scan node of the right -/// input is translated to a connector::IndexSource within +/// input is translated to a connector::common::IndexSource within /// exec::IndexLookupJoin. Only INNER and LEFT joins are supported. 
/// /// Take the following query for example, 't' is left table, 'u' is the right diff --git a/velox/core/tests/PlanNodeBuilderTest.cpp b/velox/core/tests/PlanNodeBuilderTest.cpp index e630485b2fed..6b7ae3b88b0a 100644 --- a/velox/core/tests/PlanNodeBuilderTest.cpp +++ b/velox/core/tests/PlanNodeBuilderTest.cpp @@ -42,12 +42,12 @@ class PlanNodeBuilderTest : public testing::Test, public test::VectorTestBase { .build(); }; -// A dummy implementation of ConnectorTableHandle that supports index lookup. +// A dummy implementation of connector::common::ConnectorTableHandle that supports index lookup. class TestConnectorTableHandleForLookupJoin - : public connector::ConnectorTableHandle { + : public connector::common::ConnectorTableHandle { public: explicit TestConnectorTableHandleForLookupJoin(std::string connectorId) - : connector::ConnectorTableHandle(std::move(connectorId)) {} + : connector::common::ConnectorTableHandle(std::move(connectorId)) {} bool supportsIndexLookup() const override { return true; @@ -185,13 +185,13 @@ TEST_F(PlanNodeBuilderTest, TableScanNode) { const PlanNodeId id = "table_scan_node_id"; const RowTypePtr outputType = ROW({"c0", "c1"}, {INTEGER(), VARCHAR()}); const auto tableHandle = - std::make_shared("connector_id"); + std::make_shared("connector_id"); const std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments{ - {"c0", std::make_shared()}, - {"c1", std::make_shared()}}; + {"c0", std::make_shared()}, + {"c1", std::make_shared()}}; const auto verify = [&](const std::shared_ptr& node) { EXPECT_EQ(node->id(), id); @@ -270,7 +270,7 @@ TEST_F(PlanNodeBuilderTest, TableWriteNode) { const std::vector columnNames{"c0"}; const RowTypePtr outputType = ROW({"c1"}, {BIGINT()}); const bool hasPartitioningScheme = true; - const auto commitStrategy = connector::CommitStrategy::kNoCommit; + const auto commitStrategy = connector::common::CommitStrategy::kNoCommit; const auto aggregationNode = AggregationNode::Builder() .id("aggregation_node_id") @@ -690,7 +690,7 @@ TEST_F(PlanNodeBuilderTest, IndexLookupJoinNode) { .tableHandle(std::make_shared( "connector_id")) .assignments( - {{"c1", std::make_shared()}}) + {{"c1", std::make_shared()}}) .build(); const auto outputType = ROW({"c0"}, {BIGINT()}); diff --git a/velox/core/tests/PlanNodeTest.cpp b/velox/core/tests/PlanNodeTest.cpp index 27723f7eef13..22a5d8483df2 100644 --- a/velox/core/tests/PlanNodeTest.cpp +++ b/velox/core/tests/PlanNodeTest.cpp @@ -45,10 +45,10 @@ class PlanNodeTest : public testing::Test, public test::VectorTestBase { TEST_F(PlanNodeTest, findFirstNode) { auto rowType = ROW({"name1"}, {BIGINT()}); - std::shared_ptr tableHandle; + std::shared_ptr tableHandle; std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; std::shared_ptr tableScan3 = @@ -135,10 +135,10 @@ TEST_F(PlanNodeTest, duplicateSortKeys) { "orderBy", sortingKeys, sortingOrders, false, nullptr), "Duplicate sorting keys are not allowed: c0"); } -class TestIndexTableHandle : public connector::ConnectorTableHandle { +class TestIndexTableHandle : public connector::common::ConnectorTableHandle { public: TestIndexTableHandle() - : connector::ConnectorTableHandle("TestIndexConnnector") {} + : connector::common::ConnectorTableHandle("TestIndexConnnector") {} ~TestIndexTableHandle() override = default; @@ -181,7 +181,7 @@ TEST_F(PlanNodeTest, isIndexLookupJoin) { nullptr, std::unordered_map< std::string, - std::shared_ptr>{}); + std::shared_ptr>{}); ASSERT_FALSE(isIndexLookupJoin(probeNode.get())); 
const auto buildNode = std::make_shared( "tableScan-build", @@ -189,7 +189,7 @@ TEST_F(PlanNodeTest, isIndexLookupJoin) { indexTableHandle, std::unordered_map< std::string, - std::shared_ptr>{}); + std::shared_ptr>{}); ASSERT_FALSE(isIndexLookupJoin(buildNode.get())); const std::vector leftKeys{ std::make_shared(BIGINT(), "c0")}; diff --git a/velox/dwio/common/ColumnSelector.cpp b/velox/dwio/common/ColumnSelector.cpp index 158eaa05cfab..e6ea69ffc595 100644 --- a/velox/dwio/common/ColumnSelector.cpp +++ b/velox/dwio/common/ColumnSelector.cpp @@ -124,9 +124,9 @@ FilterTypePtr ColumnSelector::buildNode( // this copy method only update inContent and data type // based on disk data type void ColumnSelector::copy( - common::FilterTypePtr& node, + dwio::common::FilterTypePtr& node, const std::shared_ptr& diskType, - const common::FilterTypePtr& origin) { + const dwio::common::FilterTypePtr& origin) { auto originIsNull = (origin == nullptr); if (!originIsNull) { node->setInContent(origin->isInContent()); @@ -147,8 +147,8 @@ void ColumnSelector::copy( // update data type during the visit as well as other data fields node->setDataType(diskType); if (!originIsNull) { - const common::FilterNode& f = origin->getNode(); - auto& fn = const_cast(node->getNode()); + const dwio::common::FilterNode& f = origin->getNode(); + auto& fn = const_cast(node->getNode()); fn.expression = f.expression; fn.partitionKey = f.partitionKey; } @@ -156,7 +156,7 @@ void ColumnSelector::copy( // visit all children for (size_t i = 0; i < node->size(); ++i) { copy( - const_cast(node->childAt(i)), + const_cast(node->childAt(i)), i < diskType->size() ? diskType->childAt(i) : nullptr, originIsNull ? nullptr : origin->childAt(i)); } @@ -189,7 +189,7 @@ ColumnSelector ColumnSelector::apply( * * @param node the starting id */ -void ColumnSelector::setRead(const common::FilterTypePtr& node, bool only) { +void ColumnSelector::setRead(const dwio::common::FilterTypePtr& node, bool only) { if (!node->valid()) { return; } @@ -302,7 +302,7 @@ const FilterTypePtr& ColumnSelector::process(const std::string& column, bool) { // set expression for this node auto& nodeValue = node->getNode(); - const_cast(nodeValue).expression = expr; + const_cast(nodeValue).expression = expr; } return node; } diff --git a/velox/dwio/common/FileSink.h b/velox/dwio/common/FileSink.h index d6aa27a56613..7fd22afe2267 100644 --- a/velox/dwio/common/FileSink.h +++ b/velox/dwio/common/FileSink.h @@ -35,7 +35,7 @@ class FileSink : public Closeable { struct Options { /// If true, allows file sink to buffer data before persist to storage. bool bufferWrite{true}; - /// Connector properties are required to create a FileSink on FileSystems + /// connector::common::Connector properties are required to create a FileSink on FileSystems /// such as S3. 
const std::shared_ptr& connectorProperties{ nullptr}; diff --git a/velox/dwio/common/Options.cpp b/velox/dwio/common/Options.cpp index f66047a31041..04ca0509547f 100644 --- a/velox/dwio/common/Options.cpp +++ b/velox/dwio/common/Options.cpp @@ -18,6 +18,8 @@ namespace facebook::velox::dwio::common { +using namespace velox::common; + FileFormat toFileFormat(std::string_view s) { if (s == "dwrf") { return FileFormat::DWRF; @@ -70,4 +72,89 @@ std::string_view toString(FileFormat fmt) { } } +folly::dynamic WriterOptions::serialize() const { + folly::dynamic obj = folly::dynamic::object; + + if (schema) { + obj["schema"] = schema->serialize(); + } + + if (spillConfig) { + // TODO + // obj["spillConfig"] = spillConfig->serialize(); + } + + if (nonReclaimableSection) { + obj["nonReclaimableSection"] = *nonReclaimableSection; + } + + // TODO: serialize memoryReclaimerFactory + + if (compressionKind) { + obj["compressionKind"] = static_cast(*compressionKind); + } + + if (!serdeParameters.empty()) { + folly::dynamic serdeObj = folly::dynamic::object; + for (auto& kv : serdeParameters) { + serdeObj[kv.first] = kv.second; + } + obj["serdeParameters"] = std::move(serdeObj); + } + + // TODO: serialize flushPolicyFactory + + obj["sessionTimezoneName"] = sessionTimezoneName; + obj["adjustTimestampToTimezone"] = adjustTimestampToTimezone; + + return obj; +} + +std::unique_ptr WriterOptions::deserialize( + const folly::dynamic& obj) { + auto opts = std::make_unique(); + + if (auto schema = obj.get_ptr("schema")) { + opts->schema = ISerializable::deserialize(*schema); + // opts->schema = Type::deserialize(schema); + } + + if (auto spillConfig = obj.get_ptr("spillConfig")) { + // TODO +// opts->spillConfig = ISerializable::deserialize(*spillConfig); + } + + if (auto nonReclaimableSection = obj.get_ptr("nonReclaimableSection")) { + // you need to supply an actual atomic somewhere; here we just allocate one: + opts->nonReclaimableSection = + new tsan_atomic(nonReclaimableSection->asBool()); + } + + // TODO: deserialize memoryReclaimerFactory + + if (auto compressionKind = obj.get_ptr("compressionKind")) { + opts->compressionKind = + static_cast(compressionKind->asInt()); + } + + if (auto serdeParameters = obj.get_ptr("serdeParameters")) { + for (auto& kv : serdeParameters->items()) { + opts->serdeParameters[kv.first.asString()] = kv.second.asString(); + } + } + + // TODO: deserialize flushPolicyFactory + + if (auto sessionTimezoneName = obj.get_ptr("sessionTimezoneName")) { + opts->sessionTimezoneName = sessionTimezoneName->asString(); + } + + if (auto adjustTimestampToTimezone = + obj.get_ptr("adjustTimestampToTimezone")) { + opts->adjustTimestampToTimezone = adjustTimestampToTimezone->asBool(); + } + + return opts; +} + } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/Options.h b/velox/dwio/common/Options.h index 8f45ed3c4294..5db754e04617 100644 --- a/velox/dwio/common/Options.h +++ b/velox/dwio/common/Options.h @@ -629,7 +629,7 @@ class ReaderOptions : public io::ReaderOptions { bool selectiveNimbleReaderEnabled_{false}; }; -struct WriterOptions { +struct WriterOptions : public ISerializable { TypePtr schema{nullptr}; velox::memory::MemoryPool* memoryPool{nullptr}; const velox::common::SpillConfig* spillConfig{nullptr}; @@ -658,6 +658,9 @@ struct WriterOptions { const config::ConfigBase& session) {} virtual ~WriterOptions() = default; + + folly::dynamic serialize() const override; + static std::unique_ptr deserialize(const folly::dynamic& obj); }; } // namespace 
facebook::velox::dwio::common diff --git a/velox/dwio/common/ScanSpec.cpp b/velox/dwio/common/ScanSpec.cpp index ee93fd0bd102..c32c2a88a6f6 100644 --- a/velox/dwio/common/ScanSpec.cpp +++ b/velox/dwio/common/ScanSpec.cpp @@ -204,7 +204,7 @@ void ScanSpec::moveAdaptationFrom(ScanSpec& other) { namespace { bool testIntFilter( - common::Filter* filter, + velox::common::Filter* filter, dwio::common::IntegerColumnStatistics* intStats, bool mayHaveNull) { if (!intStats) { @@ -239,7 +239,7 @@ bool testIntFilter( } bool testDoubleFilter( - common::Filter* filter, + velox::common::Filter* filter, dwio::common::DoubleColumnStatistics* doubleStats, bool mayHaveNull) { if (!doubleStats) { @@ -274,7 +274,7 @@ bool testDoubleFilter( } bool testStringFilter( - common::Filter* filter, + velox::common::Filter* filter, dwio::common::StringColumnStatistics* stringStats, bool mayHaveNull) { if (!stringStats) { @@ -304,7 +304,7 @@ bool testStringFilter( } bool testBoolFilter( - common::Filter* filter, + velox::common::Filter* filter, dwio::common::BooleanColumnStatistics* boolStats) { const auto trueCount = boolStats->getTrueCount(); const auto falseCount = boolStats->getFalseCount(); @@ -325,7 +325,7 @@ bool testBoolFilter( } // namespace bool testFilter( - common::Filter* filter, + velox::common::Filter* filter, dwio::common::ColumnStatistics* stats, uint64_t totalRows, const TypePtr& type) { @@ -343,7 +343,7 @@ bool testFilter( mayHaveNull = stats->getNumberOfValues().value() < totalRows; } - if (!mayHaveNull && filter->kind() == common::FilterKind::kIsNull) { + if (!mayHaveNull && filter->kind() == velox::common::FilterKind::kIsNull) { // IS NULL filter cannot pass. return false; } diff --git a/velox/dwio/common/ScanSpec.h b/velox/dwio/common/ScanSpec.h index a040fc5316bd..2b51266c2e10 100644 --- a/velox/dwio/common/ScanSpec.h +++ b/velox/dwio/common/ScanSpec.h @@ -57,7 +57,7 @@ class ScanSpec { // Filter to apply. If 'this' corresponds to a struct/list/map, this // can only be isNull or isNotNull, other filtering is given by // 'children'. - common::Filter* filter() const { + velox::common::Filter* filter() const { return filterDisabled_ ? nullptr : filter_.get(); } @@ -79,7 +79,7 @@ class ScanSpec { void addMetadataFilter( const MetadataFilter::LeafNode* leaf, - common::Filter* filter) { + velox::common::Filter* filter) { metadataFilters_.emplace_back(leaf, filter); } @@ -91,7 +91,7 @@ class ScanSpec { return metadataFilters_[i].first; } - common::Filter* metadataFilterAt(int i) const { + velox::common::Filter* metadataFilterAt(int i) const { return metadataFilters_[i].second; } @@ -422,7 +422,7 @@ class ScanSpec { // True if a string dictionary or flat map in this field should be // returned as flat. bool makeFlat_ = false; - std::unique_ptr filter_; + std::unique_ptr filter_; bool filterDisabled_ = false; dwio::common::DeltaColumnUpdater* deltaUpdate_ = nullptr; @@ -431,7 +431,7 @@ class ScanSpec { // the pointers to LeafNodes are stored here. We need to keep these pointers // so that we can match the leaf node filter results and apply logical // conjunctions later properly. - std::vector> + std::vector> metadataFilters_; SelectivityInfo selectivity_; @@ -495,7 +495,7 @@ void ScanSpec::visit(const Type& type, F&& f) { // Returns false if no value from a range defined by stats can pass the // filter. True, otherwise. 
bool testFilter( - common::Filter* filter, + velox::common::Filter* filter, dwio::common::ColumnStatistics* stats, uint64_t totalRows, const TypePtr& type); diff --git a/velox/dwio/common/SelectiveStructColumnReader.cpp b/velox/dwio/common/SelectiveStructColumnReader.cpp index d286fa27e38b..bef2fe1a55b8 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.cpp +++ b/velox/dwio/common/SelectiveStructColumnReader.cpp @@ -256,7 +256,7 @@ void SelectiveStructColumnReaderBase::next( const Mutation* mutation) { process::TraceContext trace("SelectiveStructColumnReaderBase::next"); mutation_ = mutation; - hasDeletion_ = common::hasDeletion(mutation); + hasDeletion_ = dwio::common::hasDeletion(mutation); const RowSet rows(iota(numValues, rows_), numValues); if (!children_.empty()) { diff --git a/velox/dwio/common/tests/ReaderTest.cpp b/velox/dwio/common/tests/ReaderTest.cpp index 927ac22f8808..581c33b122da 100644 --- a/velox/dwio/common/tests/ReaderTest.cpp +++ b/velox/dwio/common/tests/ReaderTest.cpp @@ -40,11 +40,11 @@ TEST_F(ReaderTest, getOrCreateChild) { makeFlatVector({2, 4, 6, 7, 8}), }); - common::ScanSpec spec(""); + velox::common::ScanSpec spec(""); spec.addField("c.0", 0); // Create child from name. spec.getOrCreateChild("c.1")->setFilter( - common::createBigintValues({2, 4, 6}, false)); + velox::common::createBigintValues({2, 4, 6}, false)); auto actual = RowReader::projectColumns(input, spec, nullptr); auto expected = makeRowVector({ @@ -53,8 +53,8 @@ TEST_F(ReaderTest, getOrCreateChild) { test::assertEqualVectors(expected, actual); // Create child from subfield. - spec.getOrCreateChild(common::Subfield("c.1")) - ->setFilter(common::createBigintValues({2, 4, 6}, false)); + spec.getOrCreateChild(velox::common::Subfield("c.1")) + ->setFilter(velox::common::createBigintValues({2, 4, 6}, false)); VELOX_ASSERT_USER_THROW( RowReader::projectColumns(input, spec, nullptr), "Field not found: c. 
Available fields are: c.0, c.1."); @@ -68,10 +68,10 @@ TEST_F(ReaderTest, projectColumnsFilterStruct) { makeFlatVector(kSize, folly::identity), }), }); - common::ScanSpec spec(""); + velox::common::ScanSpec spec(""); spec.addField("c0", 0); - spec.getOrCreateChild(common::Subfield("c1.c0")) - ->setFilter(common::createBigintValues({2, 4, 6}, false)); + spec.getOrCreateChild(velox::common::Subfield("c1.c0")) + ->setFilter(velox::common::createBigintValues({2, 4, 6}, false)); auto actual = RowReader::projectColumns(input, spec, nullptr); auto expected = makeRowVector({ makeFlatVector({2, 4, 6}), @@ -89,9 +89,9 @@ TEST_F(ReaderTest, projectColumnsFilterArray) { [](auto i) { return i; }, [](auto i) { return i % 2 != 0; }), }); - common::ScanSpec spec(""); + velox::common::ScanSpec spec(""); spec.addField("c0", 0); - auto* c1 = spec.getOrCreateChild(common::Subfield("c1")); + auto* c1 = spec.getOrCreateChild(velox::common::Subfield("c1")); { SCOPED_TRACE("IS NULL"); c1->setFilter(std::make_unique()); @@ -115,7 +115,7 @@ TEST_F(ReaderTest, projectColumnsFilterArray) { TEST_F(ReaderTest, projectColumnsMutation) { constexpr int kSize = 10; auto input = makeRowVector({makeFlatVector(kSize, folly::identity)}); - common::ScanSpec spec(""); + velox::common::ScanSpec spec(""); spec.addAllChildFields(*input->type()); std::vector deleted(bits::nwords(kSize)); bits::setBit(deleted.data(), 2); diff --git a/velox/dwio/common/tests/utils/DataSetBuilder.cpp b/velox/dwio/common/tests/utils/DataSetBuilder.cpp index 85c508dd0a3b..edf00df847de 100644 --- a/velox/dwio/common/tests/utils/DataSetBuilder.cpp +++ b/velox/dwio/common/tests/utils/DataSetBuilder.cpp @@ -113,7 +113,7 @@ DataSetBuilder& DataSetBuilder::withNoNullsAfter(int32_t firstRow) { } DataSetBuilder& DataSetBuilder::withAllNullsForField( - const common::Subfield& field) { + const velox::common::Subfield& field) { for (RowVectorPtr batch : *batches_) { auto fieldValues = getChildBySubfield(batch.get(), field); SelectivityVector rows(fieldValues->size()); @@ -124,7 +124,7 @@ DataSetBuilder& DataSetBuilder::withAllNullsForField( } DataSetBuilder& DataSetBuilder::withNullsForField( - const common::Subfield& field, + const velox::common::Subfield& field, uint8_t nullsPercent) { for (RowVectorPtr batch : *batches_) { auto fieldValues = getChildBySubfield(batch.get(), field); @@ -217,7 +217,7 @@ DataSetBuilder& DataSetBuilder::withUniqueStringsForField( } DataSetBuilder& DataSetBuilder::makeUniformMapKeys( - const common::Subfield& field) { + const velox::common::Subfield& field) { for (auto& batch : *batches_) { auto* map = dwio::common::getChildBySubfield(batch.get(), field) ->asUnchecked(); @@ -247,7 +247,7 @@ DataSetBuilder& DataSetBuilder::makeUniformMapKeys( } DataSetBuilder& DataSetBuilder::makeMapStringValues( - const common::Subfield& field) { + const velox::common::Subfield& field) { for (auto& batch : *batches_) { auto* map = dwio::common::getChildBySubfield(batch.get(), field) ->asUnchecked(); diff --git a/velox/dwio/common/tests/utils/DataSetBuilder.h b/velox/dwio/common/tests/utils/DataSetBuilder.h index e0111be1d552..537ae1cf28da 100644 --- a/velox/dwio/common/tests/utils/DataSetBuilder.h +++ b/velox/dwio/common/tests/utils/DataSetBuilder.h @@ -57,16 +57,16 @@ class DataSetBuilder { DataSetBuilder& withNoNullsAfter(int32_t firstRow = 0); // Make all rows for the specific Subfield field null - DataSetBuilder& withAllNullsForField(const common::Subfield& field); + DataSetBuilder& withAllNullsForField(const velox::common::Subfield& field); // 
Make the data for the specific Subfield field with nulls at // nullsPercentX100 % DataSetBuilder& withNullsForField( - const common::Subfield& field, + const velox::common::Subfield& field, uint8_t nullsPercentX100); DataSetBuilder& withStringDistributionForField( - const common::Subfield& field, + const velox::common::Subfield& field, int cardinality, bool keepNulls, bool addOneOffs); @@ -78,7 +78,7 @@ class DataSetBuilder { template DataSetBuilder& withIntDistributionForField( - const common::Subfield& field, + const velox::common::Subfield& field, int64_t min, int64_t max, int32_t repeats, @@ -117,7 +117,7 @@ class DataSetBuilder { } template - DataSetBuilder& withIntRleForField(const common::Subfield& field) { + DataSetBuilder& withIntRleForField(const velox::common::Subfield& field) { constexpr int kMinRun = 5; constexpr int kMaxRun = 101; int remaining = 0; @@ -143,7 +143,7 @@ class DataSetBuilder { } template - DataSetBuilder& withIntMainlyConstantForField(const common::Subfield& field) { + DataSetBuilder& withIntMainlyConstantForField(const velox::common::Subfield& field) { for (auto& batch : *batches_) { std::optional value; auto* numbers = dwio::common::getChildBySubfield(batch.get(), field) @@ -166,7 +166,7 @@ class DataSetBuilder { template DataSetBuilder& withQuantizedFloatForField( - const common::Subfield& field, + const velox::common::Subfield& field, int64_t buckets, bool keepNulls) { for (RowVectorPtr batch : *batches_) { @@ -186,7 +186,7 @@ class DataSetBuilder { template DataSetBuilder& withReapeatingValuesForField( - const common::Subfield& field, + const velox::common::Subfield& field, int32_t batchIndex, int32_t firstRow, int32_t lastRow, @@ -211,7 +211,7 @@ class DataSetBuilder { template void withSuppliedValuesForField( - const common::Subfield& field, + const velox::common::Subfield& field, int32_t batchIndex, const std::vector& values) { VELOX_CHECK_LT(batchIndex, batches_->size()); @@ -227,11 +227,11 @@ class DataSetBuilder { } } - DataSetBuilder& makeUniformMapKeys(const common::Subfield& field); + DataSetBuilder& makeUniformMapKeys(const velox::common::Subfield& field); // Ensures that there are non-inlined various string sizes in map keys/values // if either key or value is a string. 
- DataSetBuilder& makeMapStringValues(const common::Subfield& field); + DataSetBuilder& makeMapStringValues(const velox::common::Subfield& field); std::unique_ptr> build(); diff --git a/velox/dwio/common/tests/utils/E2EFilterTestBase.cpp b/velox/dwio/common/tests/utils/E2EFilterTestBase.cpp index 446aaceee766..e89b1f8446b5 100644 --- a/velox/dwio/common/tests/utils/E2EFilterTestBase.cpp +++ b/velox/dwio/common/tests/utils/E2EFilterTestBase.cpp @@ -281,7 +281,7 @@ bool E2EFilterTestBase::loadWithHook( } void E2EFilterTestBase::testReadWithFilterLazy( - const std::shared_ptr& spec, + const std::shared_ptr& spec, const MutationSpec& mutations, const std::vector& batches, const std::vector& hitRows) { @@ -486,13 +486,13 @@ void E2EFilterTestBase::testRunLengthDictionaryScenario( void E2EFilterTestBase::testMetadataFilterImpl( const std::vector& batches, - common::Subfield filterField, - std::unique_ptr filter, + velox::common::Subfield filterField, + std::unique_ptr filter, core::ExpressionEvaluator* evaluator, const std::string& remainingFilter, std::function validationFilter) { SCOPED_TRACE(fmt::format("remainingFilter={}", remainingFilter)); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); if (filter) { spec->getOrCreateChild(std::move(filterField)) ->setFilter(std::move(filter)); @@ -502,9 +502,9 @@ void E2EFilterTestBase::testMetadataFilterImpl( untypedExpr, batches[0]->type(), leafPool_.get()); auto metadataFilter = std::make_shared(*spec, *typedExpr, evaluator); - auto specA = spec->getOrCreateChild(common::Subfield("a")); - auto specB = spec->getOrCreateChild(common::Subfield("b")); - auto specC = spec->getOrCreateChild(common::Subfield("b.c")); + auto specA = spec->getOrCreateChild(velox::common::Subfield("a")); + auto specB = spec->getOrCreateChild(velox::common::Subfield("b")); + auto specC = spec->getOrCreateChild(velox::common::Subfield("b.c")); specA->setProjectOut(true); specA->setChannel(0); specB->setProjectOut(true); @@ -591,14 +591,14 @@ void E2EFilterTestBase::testMetadataFilter() { testMetadataFilterImpl( batches, - common::Subfield("a"), + velox::common::Subfield("a"), nullptr, &evaluator, "a >= 9 or not (a < 4 and b.c >= 2)", [](int64_t a, int64_t c) { return a >= 9 || !(a < 4 && c >= 2); }); testMetadataFilterImpl( batches, - common::Subfield("a"), + velox::common::Subfield("a"), exec::greaterThanOrEqual(1), &evaluator, "a >= 9 or not (a < 4 and b.c >= 2)", @@ -607,14 +607,14 @@ void E2EFilterTestBase::testMetadataFilter() { }); testMetadataFilterImpl( batches, - common::Subfield("a"), + velox::common::Subfield("a"), nullptr, &evaluator, "a in (1, 3, 8) or a >= 9", [](int64_t a, int64_t) { return a == 1 || a == 3 || a == 8 || a >= 9; }); testMetadataFilterImpl( batches, - common::Subfield("a"), + velox::common::Subfield("a"), nullptr, &evaluator, "not (a not in (2, 3, 5, 7))", @@ -631,7 +631,7 @@ void E2EFilterTestBase::testMetadataFilter() { writeToMemory(batches[0]->type(), batches, false); testMetadataFilterImpl( batches, - common::Subfield("a"), + velox::common::Subfield("a"), nullptr, &evaluator, "not (a = 1 and b.c = 2)", @@ -643,7 +643,7 @@ void E2EFilterTestBase::testMetadataFilter() { batches = { vectorMaker.rowVector({"a", "b", "c"}, {column, column, column})}; writeToMemory(batches[0]->type(), batches, false); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addAllChildFields(*batches[0]->type()); auto untypedExpr = parse::parseExpr("a = 1 or b + c = 2", {}); auto typedExpr = core::Expressions::inferTypes( 
@@ -685,7 +685,7 @@ void E2EFilterTestBase::testSubfieldsPruning() { vectorMaker.rowVector({"a", "b", "c", "d"}, {a, b, c, d})); } writeToMemory(batches[0]->type(), batches, false); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); std::vector requiredA; for (int i = 0; i <= batchSize_ + batchCount_ - 2; ++i) { if (i % 13 != 0) { @@ -693,20 +693,20 @@ void E2EFilterTestBase::testSubfieldsPruning() { } } spec->addFieldRecursively("a", *BIGINT(), 0) - ->setFilter(common::createBigintValues(requiredA, false)); + ->setFilter(velox::common::createBigintValues(requiredA, false)); std::vector requiredB; for (int i = 0; i < kMapSize; i += 2) { requiredB.push_back(i); } auto specB = spec->addFieldRecursively("b", *MAP(BIGINT(), BIGINT()), 1); specB->setFilter(exec::isNotNull()); - specB->childByName(common::ScanSpec::kMapKeysFieldName) - ->setFilter(common::createBigintValues(requiredB, false)); + specB->childByName(velox::common::ScanSpec::kMapKeysFieldName) + ->setFilter(velox::common::createBigintValues(requiredB, false)); spec->addFieldRecursively("c", *ARRAY(BIGINT()), 2) ->setMaxArrayElementsCount(6); auto specD = spec->addFieldRecursively("d", *MAP(BIGINT(), VARCHAR()), 3); - specD->childByName(common::ScanSpec::kMapKeysFieldName) - ->setFilter(common::createBigintValues({1}, false)); + specD->childByName(velox::common::ScanSpec::kMapKeysFieldName) + ->setFilter(velox::common::createBigintValues({1}, false)); ReaderOptions readerOpts{leafPool_.get()}; RowReaderOptions rowReaderOpts; auto input = std::make_unique( @@ -780,7 +780,7 @@ void E2EFilterTestBase::testMutationCornerCases() { // 1. Interleave batches with and without deletions. // 2. Whole batch deletion. // 3. Delete last a few rows in a batch. - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addAllChildFields(*rowType); RowReaderOptions rowReaderOpts; setUpRowReaderOptions(rowReaderOpts, spec); @@ -838,7 +838,7 @@ void E2EFilterTestBase::testMutationCornerCases() { ASSERT_EQ(totalScanned, 1000); // No child reader. 
- spec = std::make_shared(""); + spec = std::make_shared(""); setUpRowReaderOptions(rowReaderOpts, spec); rowReader = reader->createRowReader(rowReaderOpts); result = BaseVector::create(ROW({}), 0, leafPool_.get()); diff --git a/velox/dwio/common/tests/utils/E2EFilterTestBase.h b/velox/dwio/common/tests/utils/E2EFilterTestBase.h index 16d30e36a7b7..d5d2f7803f2c 100644 --- a/velox/dwio/common/tests/utils/E2EFilterTestBase.h +++ b/velox/dwio/common/tests/utils/E2EFilterTestBase.h @@ -106,7 +106,7 @@ class E2EFilterTestBase : public testing::Test { void SetUp() override { rootPool_ = memory::memoryManager()->addRootPool("E2EFilterTestBase"); leafPool_ = rootPool_->addLeafChild("E2EFilterTestBase"); - seed_ = common::testutil::getRandomSeed(seed_); + seed_ = velox::common::testutil::getRandomSeed(seed_); } static bool typeKindSupportsValueHook(TypeKind kind) { @@ -222,12 +222,12 @@ class E2EFilterTestBase : public testing::Test { } void readWithoutFilter( - std::shared_ptr spec, + std::shared_ptr spec, const std::vector& batches, uint64_t& time); void readWithFilter( - std::shared_ptr spec, + std::shared_ptr spec, const MutationSpec&, const std::vector& batches, const std::vector& hitRows, @@ -261,7 +261,7 @@ class E2EFilterTestBase : public testing::Test { auto reference = batches[common::batchNumber(hitRows[row])] ->childAt(columnIndex) ->as>(); - auto referenceIndex = common::batchRow(hitRows[row]); + auto referenceIndex = dwio::common::batchRow(hitRows[row]); if (reference->isNullAt(referenceIndex)) { continue; // The hook is not called on nulls. } @@ -299,7 +299,7 @@ class E2EFilterTestBase : public testing::Test { private: void testReadWithFilterLazy( - const std::shared_ptr& spec, + const std::shared_ptr& spec, const MutationSpec&, const std::vector& batches, const std::vector& hitRows); @@ -330,8 +330,8 @@ class E2EFilterTestBase : public testing::Test { private: void testMetadataFilterImpl( const std::vector& batches, - common::Subfield filterField, - std::unique_ptr filter, + velox::common::Subfield filterField, + std::unique_ptr filter, core::ExpressionEvaluator*, const std::string& remainingFilter, std::function validationFilter); @@ -360,7 +360,7 @@ class E2EFilterTestBase : public testing::Test { const size_t kBatchSize = 25'000; std::unique_ptr dataSetBuilder_; - std::unique_ptr filterGenerator_; + std::unique_ptr filterGenerator_; std::shared_ptr rootPool_; std::shared_ptr leafPool_; std::shared_ptr rowType_; diff --git a/velox/dwio/dwrf/common/ByteRLE.cpp b/velox/dwio/dwrf/common/ByteRLE.cpp index 23829b721b7f..74fa175f9b0d 100644 --- a/velox/dwio/dwrf/common/ByteRLE.cpp +++ b/velox/dwio/dwrf/common/ByteRLE.cpp @@ -34,17 +34,17 @@ class ByteRleEncoderImpl : public ByteRleEncoder { uint64_t add( const char* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) override; uint64_t add( const std::function& valueAt, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const std::function& isNullAt) override; uint64_t addBits( const uint64_t* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls, bool invert) override { throw std::runtime_error("addBits is only for bool stream"); @@ -52,7 +52,7 @@ class ByteRleEncoderImpl : public ByteRleEncoder { uint64_t addBits( const std::function& isNullAt, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const std::function& valueAt, bool invert) override { throw std::runtime_error("addBits is only for bool 
stream"); @@ -96,7 +96,7 @@ void ByteRleEncoderImpl::writeByte(char c) { uint64_t ByteRleEncoderImpl::add( const char* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) { uint64_t count = 0; if (nulls) { @@ -117,7 +117,7 @@ uint64_t ByteRleEncoderImpl::add( uint64_t ByteRleEncoderImpl::add( const std::function& valueAt, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const std::function& isNullAt) { uint64_t count = 0; if (isNullAt) { @@ -219,18 +219,18 @@ class BooleanRleEncoderImpl : public ByteRleEncoderImpl { uint64_t add( const char* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) override; uint64_t addBits( const uint64_t* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls, bool invert) override; uint64_t addBits( const std::function& isNullAt, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const std::function& valueAt, bool invert) override; @@ -268,7 +268,7 @@ class BooleanRleEncoderImpl : public ByteRleEncoderImpl { uint64_t BooleanRleEncoderImpl::add( const char* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) { uint64_t count = 0; if (nulls) { @@ -289,7 +289,7 @@ uint64_t BooleanRleEncoderImpl::add( uint64_t BooleanRleEncoderImpl::addBits( const uint64_t* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls, bool invert) { uint64_t count = 0; @@ -313,7 +313,7 @@ uint64_t BooleanRleEncoderImpl::addBits( uint64_t BooleanRleEncoderImpl::addBits( const std::function& valueAt, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const std::function& isNullAt, bool invert) { uint64_t count = 0; diff --git a/velox/dwio/dwrf/common/ByteRLE.h b/velox/dwio/dwrf/common/ByteRLE.h index ea7ff790e843..e0e4308ac238 100644 --- a/velox/dwio/dwrf/common/ByteRLE.h +++ b/velox/dwio/dwrf/common/ByteRLE.h @@ -46,12 +46,12 @@ class ByteRleEncoder { */ virtual uint64_t add( const char* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) = 0; virtual uint64_t add( const std::function& valueAt, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const std::function& isNullAt) = 0; /** @@ -64,13 +64,13 @@ class ByteRleEncoder { */ virtual uint64_t addBits( const uint64_t* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls, bool invert) = 0; virtual uint64_t addBits( const std::function& valueAt, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const std::function& isNullAt, bool invert) = 0; diff --git a/velox/dwio/dwrf/common/Compression.h b/velox/dwio/dwrf/common/Compression.h index 0be6a4e0e417..afe98c3a82a9 100644 --- a/velox/dwio/dwrf/common/Compression.h +++ b/velox/dwio/dwrf/common/Compression.h @@ -60,7 +60,7 @@ inline CompressionOptions getDwrfOrcCompressionOptions( * @param config The compression options to use */ inline std::unique_ptr createCompressor( - common::CompressionKind kind, + velox::common::CompressionKind kind, CompressionBufferPool& bufferPool, dwio::common::DataBufferHolder& bufferHolder, const Config& config, @@ -72,7 +72,7 @@ inline std::unique_ptr createCompressor( config.get(Config::ZSTD_COMPRESSION_LEVEL)); auto compressor = createCompressor(kind, dwrfOrcCompressionOptions); if (!compressor) { - if (!encrypter && kind == 
common::CompressionKind::CompressionKind_NONE) { + if (!encrypter && kind == velox::common::CompressionKind::CompressionKind_NONE) { return std::make_unique(bufferHolder); } } @@ -86,14 +86,14 @@ inline std::unique_ptr createCompressor( } inline CompressionOptions getDwrfOrcDecompressionOptions( - common::CompressionKind kind) { + velox::common::CompressionKind kind) { CompressionOptions options{}; - if (kind == common::CompressionKind_ZLIB || - kind == common::CompressionKind_GZIP) { + if (kind == velox::common::CompressionKind_ZLIB || + kind == velox::common::CompressionKind_GZIP) { options.format.zlib.windowBits = Compressor::DWRF_ORC_ZLIB_WINDOW_BITS; } else if ( - kind == common::CompressionKind_LZ4 || - kind == common::CompressionKind_LZO) { + kind == velox::common::CompressionKind_LZ4 || + kind == velox::common::CompressionKind_LZO) { options.format.lz4_lzo.isHadoopFrameFormat = false; } return options; diff --git a/velox/dwio/dwrf/common/Config.cpp b/velox/dwio/dwrf/common/Config.cpp index 8ce1ad5f2155..4564377fa82f 100644 --- a/velox/dwio/dwrf/common/Config.cpp +++ b/velox/dwio/dwrf/common/Config.cpp @@ -25,9 +25,9 @@ Config::Entry Config::WRITER_VERSION( "orc.writer.version", WriterVersion_CURRENT); -Config::Entry Config::COMPRESSION( +Config::Entry Config::COMPRESSION( "hive.exec.orc.compress", - common::CompressionKind::CompressionKind_ZSTD); + velox::common::CompressionKind::CompressionKind_ZSTD); Config::Entry Config::ZLIB_COMPRESSION_LEVEL( "hive.exec.orc.compress.zlib.level", diff --git a/velox/dwio/dwrf/common/Config.h b/velox/dwio/dwrf/common/Config.h index 386321feabb5..c3bd92efe4f4 100644 --- a/velox/dwio/dwrf/common/Config.h +++ b/velox/dwio/dwrf/common/Config.h @@ -30,7 +30,7 @@ class Config : public config::ConfigBase { using Entry = config::ConfigBase::Entry; static Entry WRITER_VERSION; - static Entry COMPRESSION; + static Entry COMPRESSION; static Entry ZLIB_COMPRESSION_LEVEL; static Entry ZSTD_COMPRESSION_LEVEL; static Entry COMPRESSION_BLOCK_SIZE; diff --git a/velox/dwio/dwrf/common/FileMetadata.cpp b/velox/dwio/dwrf/common/FileMetadata.cpp index ccb9f6faa7f5..718043eca8a3 100644 --- a/velox/dwio/dwrf/common/FileMetadata.cpp +++ b/velox/dwio/dwrf/common/FileMetadata.cpp @@ -17,7 +17,7 @@ namespace facebook::velox::dwrf { namespace detail { -using common::CompressionKind; +using velox::common::CompressionKind; CompressionKind orcCompressionToCompressionKind( proto::orc::CompressionKind compression) { @@ -112,7 +112,7 @@ TypeKind TypeWrapper::kind() const { common::CompressionKind PostScript::compression() const { return format_ == DwrfFormat::kDwrf - ? static_cast(dwrfPtr()->compression()) + ? static_cast(dwrfPtr()->compression()) : detail::orcCompressionToCompressionKind(orcPtr()->compression()); } diff --git a/velox/dwio/dwrf/common/FileMetadata.h b/velox/dwio/dwrf/common/FileMetadata.h index 87e8c12719fe..60fdc68951bf 100644 --- a/velox/dwio/dwrf/common/FileMetadata.h +++ b/velox/dwio/dwrf/common/FileMetadata.h @@ -98,7 +98,7 @@ class PostScript { : orcPtr()->has_compression(); } - common::CompressionKind compression() const; + velox::common::CompressionKind compression() const; bool hasCompressionBlockSize() const { return format_ == DwrfFormat::kDwrf ? 
dwrfPtr()->has_compressionblocksize() diff --git a/velox/dwio/dwrf/common/FloatingPointDecoder.h b/velox/dwio/dwrf/common/FloatingPointDecoder.h index 7261371cb5ce..2eda15650610 100644 --- a/velox/dwio/dwrf/common/FloatingPointDecoder.h +++ b/velox/dwio/dwrf/common/FloatingPointDecoder.h @@ -103,7 +103,7 @@ class FloatingPointDecoder { template void fastPath(const uint64_t* nulls, Visitor& visitor) { constexpr bool hasFilter = - !std::is_same_v; + !std::is_same_v; constexpr bool filterOnly = std::is_same_v; constexpr bool hasHook = diff --git a/velox/dwio/dwrf/common/IntEncoder.h b/velox/dwio/dwrf/common/IntEncoder.h index 1051967bb39b..e022b06cc479 100644 --- a/velox/dwio/dwrf/common/IntEncoder.h +++ b/velox/dwio/dwrf/common/IntEncoder.h @@ -56,7 +56,7 @@ class IntEncoder { */ virtual uint64_t add( const int64_t* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) { return addImpl(data, ranges, nulls); } @@ -68,28 +68,28 @@ class IntEncoder { // to unsigned then to int64_t. virtual uint64_t add( const int32_t* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) { return addImpl(data, ranges, nulls); } virtual uint64_t add( const uint32_t* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) { return addImpl(data, ranges, nulls); } virtual uint64_t add( const int16_t* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) { return addImpl(data, ranges, nulls); } virtual uint64_t add( const uint16_t* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) { return addImpl(data, ranges, nulls); } @@ -204,7 +204,7 @@ class IntEncoder { private: template uint64_t - addImpl(const T* data, const common::Ranges& ranges, const uint64_t* nulls); + addImpl(const T* data, const velox::common::Ranges& ranges, const uint64_t* nulls); FOLLY_ALWAYS_INLINE void writeBuffer(char* start, char* end) { int32_t valsToWrite = end - start; @@ -273,7 +273,7 @@ template template uint64_t IntEncoder::addImpl( const T* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) { if (!useVInts_) { WRITE_INTS(writeLongLE); diff --git a/velox/dwio/dwrf/common/RLEv1.h b/velox/dwio/dwrf/common/RLEv1.h index 082b57c10e08..9c10aa79ae78 100644 --- a/velox/dwio/dwrf/common/RLEv1.h +++ b/velox/dwio/dwrf/common/RLEv1.h @@ -48,35 +48,35 @@ class RleEncoderV1 : public IntEncoder { /// can support uint64_t overload. 
uint64_t add( const int64_t* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) override { return addImpl(data, ranges, nulls); } uint64_t add( const int32_t* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) override { return addImpl(data, ranges, nulls); } uint64_t add( const uint32_t* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) override { return addImpl(data, ranges, nulls); } uint64_t add( const int16_t* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) override { return addImpl(data, ranges, nulls); } uint64_t add( const uint16_t* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) override { return addImpl(data, ranges, nulls); } @@ -155,7 +155,7 @@ class RleEncoderV1 : public IntEncoder { template uint64_t - addImpl(const T* data, const common::Ranges& ranges, const uint64_t* nulls); + addImpl(const T* data, const velox::common::Ranges& ranges, const uint64_t* nulls); template FOLLY_ALWAYS_INLINE bool isRunRepeating(const Integer& value) { @@ -216,7 +216,7 @@ template template uint64_t RleEncoderV1::addImpl( const T* data, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, const uint64_t* nulls) { uint64_t count = 0; if (nulls) { @@ -314,7 +314,7 @@ class RleDecoderV1 : public dwio::common::IntDecoder { template void fastPath(const uint64_t* nulls, Visitor& visitor) { constexpr bool hasFilter = - !std::is_same_v; + !std::is_same_v; constexpr bool hasHook = !std::is_same_v; auto rows = visitor.rows(); diff --git a/velox/dwio/dwrf/reader/DwrfData.cpp b/velox/dwio/dwrf/reader/DwrfData.cpp index 509791b44d12..f8a116f72b87 100644 --- a/velox/dwio/dwrf/reader/DwrfData.cpp +++ b/velox/dwio/dwrf/reader/DwrfData.cpp @@ -150,7 +150,7 @@ void DwrfData::readNulls( } void DwrfData::filterRowGroups( - const common::ScanSpec& scanSpec, + const velox::common::ScanSpec& scanSpec, uint64_t rowGroupSize, const dwio::common::StatsContext& writerContext, FilterRowGroupsResult& result) { diff --git a/velox/dwio/dwrf/reader/DwrfData.h b/velox/dwio/dwrf/reader/DwrfData.h index 5c17cd77aafc..1bcdb31ec703 100644 --- a/velox/dwio/dwrf/reader/DwrfData.h +++ b/velox/dwio/dwrf/reader/DwrfData.h @@ -51,7 +51,7 @@ class DwrfData : public dwio::common::FormatData { } void filterRowGroups( - const common::ScanSpec& scanSpec, + const velox::common::ScanSpec& scanSpec, uint64_t rowsPerRowGroup, const dwio::common::StatsContext& writerContext, FilterRowGroupsResult& result) override; @@ -132,7 +132,7 @@ class DwrfParams : public dwio::common::FormatParams { std::unique_ptr toFormatData( const std::shared_ptr& type, - const common::ScanSpec& /*scanSpec*/) override { + const velox::common::ScanSpec& /*scanSpec*/) override { return std::make_unique( type, stripeStreams_, streamLabels_, flatMapContext_); } diff --git a/velox/dwio/dwrf/reader/DwrfReader.cpp b/velox/dwio/dwrf/reader/DwrfReader.cpp index 72fb486888a4..12df5d25b704 100644 --- a/velox/dwio/dwrf/reader/DwrfReader.cpp +++ b/velox/dwio/dwrf/reader/DwrfReader.cpp @@ -998,7 +998,7 @@ uint64_t DwrfReader::getMemoryUse( // Decompressors need buffers for each stream uint64_t decompressorMemoryBytes = 0; const auto compressionKind = readerBase.compressionKind(); - if (compressionKind != common::CompressionKind_NONE) { + if (compressionKind != velox::common::CompressionKind_NONE) { for (int32_t i = 0; i 
< fileFooter.typesSize(); ++i) { if (cs.shouldReadNode(i)) { const auto type = fileFooter.types(i); @@ -1006,7 +1006,7 @@ uint64_t DwrfReader::getMemoryUse( maxStreamsForType(type) * readerBase.compressionBlockSize(); } } - if (compressionKind == common::CompressionKind_SNAPPY) { + if (compressionKind == velox::common::CompressionKind_SNAPPY) { decompressorMemoryBytes *= 2; // Snappy decompressor uses a second buffer } } diff --git a/velox/dwio/dwrf/reader/DwrfReader.h b/velox/dwio/dwrf/reader/DwrfReader.h index 8fecb3f90f76..3df2e366e586 100644 --- a/velox/dwio/dwrf/reader/DwrfReader.h +++ b/velox/dwio/dwrf/reader/DwrfReader.h @@ -232,7 +232,7 @@ class DwrfReader : public dwio::common::Reader { ~DwrfReader() override = default; - common::CompressionKind getCompression() const { + velox::common::CompressionKind getCompression() const { return readerBase_->compressionKind(); } diff --git a/velox/dwio/dwrf/reader/ReaderBase.cpp b/velox/dwio/dwrf/reader/ReaderBase.cpp index 16bd779b500a..1628b7362f5f 100644 --- a/velox/dwio/dwrf/reader/ReaderBase.cpp +++ b/velox/dwio/dwrf/reader/ReaderBase.cpp @@ -351,7 +351,7 @@ std::shared_ptr ReaderBase::convertType( DECIMAL(type.getOrcPtr()->precision(), type.getOrcPtr()->scale()); } else { converted = BIGINT(); - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::dwrf::ReaderBase::convertType", &converted); } return converted; diff --git a/velox/dwio/dwrf/reader/ReaderBase.h b/velox/dwio/dwrf/reader/ReaderBase.h index 561d88e56841..2872ef771914 100644 --- a/velox/dwio/dwrf/reader/ReaderBase.h +++ b/velox/dwio/dwrf/reader/ReaderBase.h @@ -177,13 +177,13 @@ class ReaderBase { uint64_t compressionBlockSize() const { return postScript_->hasCompressionBlockSize() ? postScript_->compressionBlockSize() - : common::kDefaultCompressionBlockSize; + : velox::common::kDefaultCompressionBlockSize; } - common::CompressionKind compressionKind() const { + velox::common::CompressionKind compressionKind() const { return postScript_->hasCompressionBlockSize() ? 
postScript_->compression() - : common::CompressionKind::CompressionKind_NONE; + : velox::common::CompressionKind::CompressionKind_NONE; } WriterVersion writerVersion() const { diff --git a/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h b/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h index 09d43dc9cae4..8699b72a1897 100644 --- a/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h @@ -29,7 +29,7 @@ class SelectiveByteRleColumnReader const TypePtr& requestedType, std::shared_ptr fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, bool isBool) : dwio::common::SelectiveByteRleColumnReader( requestedType, diff --git a/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.cpp index ba14222b1dc5..5773d2b894b6 100644 --- a/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.cpp @@ -22,7 +22,7 @@ template SelectiveDecimalColumnReader::SelectiveDecimalColumnReader( const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec) + velox::common::ScanSpec& scanSpec) : SelectiveColumnReader(fileType->type(), fileType, params, scanSpec) { EncodingKey encodingKey{fileType_->id(), params.flatMapContext().sequence}; auto& stripe = params.stripeStreams(); @@ -76,13 +76,13 @@ void SelectiveDecimalColumnReader::seekToRowGroup(int64_t index) { template template void SelectiveDecimalColumnReader::readHelper( - common::Filter* filter, + velox::common::Filter* filter, RowSet rows) { ExtractToReader extractValues(this); - common::AlwaysTrue alwaysTrue; + velox::common::AlwaysTrue alwaysTrue; DirectRleColumnVisitor< int64_t, - common::AlwaysTrue, + velox::common::AlwaysTrue, decltype(extractValues), kDense> visitor(alwaysTrue, this, rows, extractValues); @@ -110,7 +110,7 @@ void SelectiveDecimalColumnReader::readHelper( // decode value stream facebook::velox::dwio::common:: - ColumnVisitor + ColumnVisitor valueVisitor(alwaysTrue, this, rows, extractValues); decodeWithVisitor>(valueDecoder_.get(), valueVisitor); readOffset_ += numRows; @@ -164,7 +164,7 @@ void SelectiveDecimalColumnReader::processNulls( template void SelectiveDecimalColumnReader::processFilter( - const common::Filter* filter, + const velox::common::Filter* filter, const RowSet& rows, const uint64_t* rawNulls) { VELOX_CHECK_NOT_NULL(filter, "Filter must not be null."); @@ -205,36 +205,36 @@ void SelectiveDecimalColumnReader::processFilter( template void SelectiveDecimalColumnReader::process( - const common::Filter* filter, + const velox::common::Filter* filter, const RowSet& rows, const uint64_t* rawNulls) { // Treat the filter as kAlwaysTrue if any of the following conditions are met: // 1) No filter found; // 2) Filter is kIsNotNull but rawNulls == NULL (no elements is null). auto filterKind = - !filter || (filter->kind() == common::FilterKind::kIsNotNull && !rawNulls) - ? common::FilterKind::kAlwaysTrue + !filter || (filter->kind() == velox::common::FilterKind::kIsNotNull && !rawNulls) + ? velox::common::FilterKind::kAlwaysTrue : filter->kind(); switch (filterKind) { - case common::FilterKind::kAlwaysTrue: + case velox::common::FilterKind::kAlwaysTrue: // Simply add all rows to output. 
       for (vector_size_t i = 0; i < numValues_; i++) {
         addOutputRow(rows[i]);
       }
       break;
-    case common::FilterKind::kIsNull:
+    case velox::common::FilterKind::kIsNull:
       processNulls(true, rows, rawNulls);
       break;
-    case common::FilterKind::kIsNotNull:
+    case velox::common::FilterKind::kIsNotNull:
       processNulls(false, rows, rawNulls);
       break;
-    case common::FilterKind::kBigintRange:
-    case common::FilterKind::kBigintValuesUsingHashTable:
-    case common::FilterKind::kBigintValuesUsingBitmask:
-    case common::FilterKind::kNegatedBigintRange:
-    case common::FilterKind::kNegatedBigintValuesUsingHashTable:
-    case common::FilterKind::kNegatedBigintValuesUsingBitmask:
-    case common::FilterKind::kBigintMultiRange: {
+    case velox::common::FilterKind::kBigintRange:
+    case velox::common::FilterKind::kBigintValuesUsingHashTable:
+    case velox::common::FilterKind::kBigintValuesUsingBitmask:
+    case velox::common::FilterKind::kNegatedBigintRange:
+    case velox::common::FilterKind::kNegatedBigintValuesUsingHashTable:
+    case velox::common::FilterKind::kNegatedBigintValuesUsingBitmask:
+    case velox::common::FilterKind::kBigintMultiRange: {
       if constexpr (std::is_same_v) {
         processFilter(filter, rows, rawNulls);
       } else {
@@ -245,8 +245,8 @@ void SelectiveDecimalColumnReader::process(
       }
       break;
     }
-    case common::FilterKind::kHugeintValuesUsingHashTable:
-    case common::FilterKind::kHugeintRange: {
+    case velox::common::FilterKind::kHugeintValuesUsingHashTable:
+    case velox::common::FilterKind::kHugeintRange: {
       if constexpr (std::is_same_v) {
         processFilter(filter, rows, rawNulls);
       } else {
diff --git a/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.h b/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.h
index 4482ef47fc50..35b8872565c5 100644
--- a/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.h
+++ b/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.h
@@ -31,7 +31,7 @@ class SelectiveDecimalColumnReader : public SelectiveColumnReader {
   SelectiveDecimalColumnReader(
       const std::shared_ptr& fileType,
       DwrfParams& params,
-      common::ScanSpec& scanSpec);
+      velox::common::ScanSpec& scanSpec);
 
   bool hasBulkPath() const override {
     // Only ORC uses RLEv2 encoding. Currently, ORC decimal data does not
@@ -49,20 +49,20 @@ class SelectiveDecimalColumnReader : public SelectiveColumnReader {
  private:
   template
-  void readHelper(common::Filter* filter, RowSet rows);
+  void readHelper(velox::common::Filter* filter, RowSet rows);
 
   // Process IsNull and IsNotNull filters.
   void processNulls(bool isNull, const RowSet& rows, const uint64_t* rawNulls);
 
   // Process filters on decimal values.
   void processFilter(
-      const common::Filter* filter,
+      const velox::common::Filter* filter,
       const RowSet& rows,
       const uint64_t* rawNulls);
 
   // Dispatch to the respective filter processing based on the filter type.
void process( - const common::Filter* filter, + const velox::common::Filter* filter, const RowSet& rows, const uint64_t* rawNulls); diff --git a/velox/dwio/dwrf/reader/SelectiveDwrfReader.cpp b/velox/dwio/dwrf/reader/SelectiveDwrfReader.cpp index 3306d7917ac5..6457e58636ef 100644 --- a/velox/dwio/dwrf/reader/SelectiveDwrfReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveDwrfReader.cpp @@ -38,7 +38,7 @@ std::unique_ptr buildIntegerReader( const std::shared_ptr& fileType, DwrfParams& params, uint32_t numBytes, - common::ScanSpec& scanSpec) { + velox::common::ScanSpec& scanSpec) { const EncodingKey encodingKey{ fileType->id(), params.flatMapContext().sequence}; auto& stripe = params.stripeStreams(); @@ -63,7 +63,7 @@ std::unique_ptr SelectiveDwrfReader::build( const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, bool useColumnNames, bool isRoot) { VELOX_CHECK( diff --git a/velox/dwio/dwrf/reader/SelectiveDwrfReader.h b/velox/dwio/dwrf/reader/SelectiveDwrfReader.h index a787a6a10e63..faa48f4d89b9 100644 --- a/velox/dwio/dwrf/reader/SelectiveDwrfReader.h +++ b/velox/dwio/dwrf/reader/SelectiveDwrfReader.h @@ -29,7 +29,7 @@ class SelectiveDwrfReader { const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, bool useColumnNames, bool isRoot = false); @@ -41,7 +41,7 @@ class SelectiveDwrfReader { StripeStreams& stripe, const StreamLabels& streamLabels, dwio::common::ColumnReaderStatistics& stats, - common::ScanSpec* scanSpec, + velox::common::ScanSpec* scanSpec, bool useColumnNames, FlatMapContext flatMapContext = {}, bool isRoot = false) { diff --git a/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp index 8afbf64e90b4..d48b9b8be2ed 100644 --- a/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp @@ -70,7 +70,7 @@ std::vector> getKeyNodes( const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, bool asStruct, bool useColumnNames) { using namespace dwio::common::flatmap; @@ -82,14 +82,14 @@ std::vector> getKeyNodes( auto& dataValueType = fileType->childAt(1); auto& stripe = params.stripeStreams(); - common::ScanSpec* keysSpec = nullptr; - common::ScanSpec* valuesSpec = nullptr; - std::unordered_map, common::ScanSpec*, KeyValueHash> + velox::common::ScanSpec* keysSpec = nullptr; + velox::common::ScanSpec* valuesSpec = nullptr; + std::unordered_map, velox::common::ScanSpec*, KeyValueHash> childSpecs; if (!asStruct) { - keysSpec = scanSpec.getOrCreateChild(common::ScanSpec::kMapKeysFieldName); + keysSpec = scanSpec.getOrCreateChild(velox::common::ScanSpec::kMapKeysFieldName); valuesSpec = - scanSpec.getOrCreateChild(common::ScanSpec::kMapValuesFieldName); + scanSpec.getOrCreateChild(velox::common::ScanSpec::kMapValuesFieldName); VELOX_CHECK(!valuesSpec->hasFilter()); keysSpec->setProjectOut(true); valuesSpec->setProjectOut(true); @@ -118,7 +118,7 @@ std::vector> getKeyNodes( EncodingKey seqEk(dataValueType->id(), sequence); const auto& keyInfo = stripe.getEncoding(seqEk).key(); auto key = extractKey(keyInfo); - common::ScanSpec* childSpec; + velox::common::ScanSpec* childSpec; if (auto it = childSpecs.find(key); it != childSpecs.end() && !it->second->isConstant()) { childSpec = it->second; @@ 
-174,7 +174,7 @@ class SelectiveFlatMapAsStructReader : public SelectiveStructColumnReaderBase { const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, bool useColumnNames) : SelectiveStructColumnReaderBase( requestedType, @@ -213,7 +213,7 @@ class SelectiveFlatMapReader : public SelectiveStructColumnReaderBase { const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, bool useColumnNames) : SelectiveStructColumnReaderBase( requestedType, @@ -250,7 +250,7 @@ std::unique_ptr createReader( const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, bool useColumnNames) { if (scanSpec.isFlatMapAsStruct()) { return std::make_unique>( @@ -268,7 +268,7 @@ createSelectiveFlatMapColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, bool useColumnNames) { auto kind = fileType->childAt(0)->type()->kind(); switch (kind) { diff --git a/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.h b/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.h index e12936d16510..c7cde893a535 100644 --- a/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.h @@ -26,7 +26,7 @@ createSelectiveFlatMapColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams&, - common::ScanSpec&, + velox::common::ScanSpec&, bool useColumnNames); } // namespace facebook::velox::dwrf diff --git a/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h b/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h index 7768819436b9..86455d58d63c 100644 --- a/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h @@ -35,7 +35,7 @@ class SelectiveFloatingPointColumnReader const TypePtr& requestedType, std::shared_ptr fileType, DwrfParams& params, - common::ScanSpec& scanSpec); + velox::common::ScanSpec& scanSpec); void seekToRowGroup(int64_t index) override { base::seekToRowGroup(index); @@ -65,7 +65,7 @@ SelectiveFloatingPointColumnReader:: const TypePtr& requestedType, std::shared_ptr fileType, DwrfParams& params, - common::ScanSpec& scanSpec) + velox::common::ScanSpec& scanSpec) : dwio::common::SelectiveFloatingPointColumnReader( requestedType, std::move(fileType), diff --git a/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp index c2a4edc130ab..8dc248c598bb 100644 --- a/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp @@ -25,7 +25,7 @@ SelectiveIntegerDictionaryColumnReader::SelectiveIntegerDictionaryColumnReader( const TypePtr& requestedType, std::shared_ptr fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, uint32_t numBytes) : SelectiveIntegerColumnReader( requestedType, diff --git a/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.h b/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.h index 2e6d6e091221..d4a6113a5644 100644 --- a/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.h +++ 
b/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.h @@ -31,7 +31,7 @@ class SelectiveIntegerDictionaryColumnReader const TypePtr& requestedType, std::shared_ptr fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, uint32_t numBytes); void seekToRowGroup(int64_t index) override { diff --git a/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.h b/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.h index eb2b3038a2da..a29326ffbe3c 100644 --- a/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.h @@ -32,7 +32,7 @@ class SelectiveIntegerDirectColumnReader std::shared_ptr fileType, DwrfParams& params, uint32_t numBytes, - common::ScanSpec& scanSpec) + velox::common::ScanSpec& scanSpec) : SelectiveIntegerColumnReader( requestedType, params, diff --git a/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.cpp index 06e2a975393e..981be4d8ea1a 100644 --- a/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.cpp @@ -52,7 +52,7 @@ SelectiveListColumnReader::SelectiveListColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, bool useColumnNames) : dwio::common::SelectiveListColumnReader( requestedType, @@ -66,7 +66,7 @@ SelectiveListColumnReader::SelectiveListColumnReader( // count the number of selected sub-columns auto& childType = requestedType_->childAt(0); if (scanSpec_->children().empty()) { - scanSpec.getOrCreateChild(common::ScanSpec::kArrayElementsFieldName); + scanSpec.getOrCreateChild(velox::common::ScanSpec::kArrayElementsFieldName); } scanSpec_->children()[0]->setProjectOut(true); @@ -88,7 +88,7 @@ SelectiveMapColumnReader::SelectiveMapColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, bool useColumnNames) : dwio::common::SelectiveMapColumnReader( requestedType, @@ -101,8 +101,8 @@ SelectiveMapColumnReader::SelectiveMapColumnReader( fileType_->id(), params.flatMapContext().sequence}; auto& stripe = params.stripeStreams(); if (scanSpec_->children().empty()) { - scanSpec_->getOrCreateChild(common::ScanSpec::kMapKeysFieldName); - scanSpec_->getOrCreateChild(common::ScanSpec::kMapValuesFieldName); + scanSpec_->getOrCreateChild(velox::common::ScanSpec::kMapKeysFieldName); + scanSpec_->getOrCreateChild(velox::common::ScanSpec::kMapValuesFieldName); } scanSpec_->children()[0]->setProjectOut(true); scanSpec_->children()[1]->setProjectOut(true); diff --git a/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.h b/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.h index 7149190ae54c..fa5ef37b2286 100644 --- a/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.h @@ -31,7 +31,7 @@ class SelectiveListColumnReader const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, bool useColumnNames); void resetFilterCaches() override { @@ -65,7 +65,7 @@ class SelectiveMapColumnReader : public dwio::common::SelectiveMapColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& 
scanSpec, bool useColumnNames); void resetFilterCaches() override { diff --git a/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.cpp index d88b8c24e404..a029f763be60 100644 --- a/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.cpp @@ -25,7 +25,7 @@ using namespace dwio::common; SelectiveStringDictionaryColumnReader::SelectiveStringDictionaryColumnReader( const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec) + velox::common::ScanSpec& scanSpec) : SelectiveColumnReader(fileType->type(), fileType, params, scanSpec), lastStrideIndex_(-1), provider_(params.stripeStreams().getStrideIndexProvider()), diff --git a/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.h b/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.h index d524f3c3f9bd..f5f7cb751a07 100644 --- a/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.h @@ -30,7 +30,7 @@ class SelectiveStringDictionaryColumnReader SelectiveStringDictionaryColumnReader( const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec); + velox::common::ScanSpec& scanSpec); bool hasBulkPath() const override { // Only ORC uses RLEv2 encoding. Currently, ORC string data does not diff --git a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp index 657470e92676..6ac31ffcdc83 100644 --- a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp @@ -25,7 +25,7 @@ namespace facebook::velox::dwrf { SelectiveStringDirectColumnReader::SelectiveStringDirectColumnReader( const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec) + velox::common::ScanSpec& scanSpec) : SelectiveColumnReader(fileType->type(), fileType, params, scanSpec) { EncodingKey encodingKey{fileType->id(), params.flatMapContext().sequence}; auto& stripe = params.stripeStreams(); @@ -195,7 +195,7 @@ bool SelectiveStringDirectColumnReader::try8ConsecutiveSmall( int startRow) { #ifndef NDEBUG bool testCoverage[] = {kScatter, kGreaterThan4}; - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::dwrf::SelectiveStringDirectColumnReader::try8ConsecutiveSmall", testCoverage); #endif @@ -428,7 +428,7 @@ void SelectiveStringDirectColumnReader::readWithVisitor( TVisitor visitor) { int32_t current = visitor.start(); constexpr bool isExtract = - std::is_same_v && + std::is_same_v && std::is_same_v; auto nulls = nullsInReadRange_ ? 
nullsInReadRange_->as() : nullptr; diff --git a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h index 8da1e77401d2..1c1ec34dc19f 100644 --- a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h @@ -28,7 +28,7 @@ class SelectiveStringDirectColumnReader SelectiveStringDirectColumnReader( const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec); + velox::common::ScanSpec& scanSpec); void seekToRowGroup(int64_t index) override { SelectiveColumnReader::seekToRowGroup(index); diff --git a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp index 03874680e983..a86c70084656 100644 --- a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp @@ -27,7 +27,7 @@ SelectiveStructColumnReader::SelectiveStructColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, bool useColumnNames, bool isRoot) : SelectiveStructColumnReaderBase( diff --git a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h index 4d28b692b379..d375453ddc5c 100644 --- a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h @@ -28,7 +28,7 @@ class SelectiveStructColumnReaderBase const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, bool useColumnNames, bool isRoot = false) : dwio::common::SelectiveStructColumnReaderBase( @@ -86,7 +86,7 @@ class SelectiveStructColumnReader : public SelectiveStructColumnReaderBase { const TypePtr& requestedType, const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, bool useColumnNames, bool isRoot = false); diff --git a/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.cpp index 99f790851e97..1b730949748e 100644 --- a/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.cpp @@ -25,7 +25,7 @@ using namespace dwio::common; SelectiveTimestampColumnReader::SelectiveTimestampColumnReader( const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec) + velox::common::ScanSpec& scanSpec) : SelectiveColumnReader(fileType->type(), fileType, params, scanSpec), precision_( params.stripeStreams().rowReaderOptions().timestampPrecision()) { @@ -100,13 +100,13 @@ void SelectiveTimestampColumnReader::read( template void SelectiveTimestampColumnReader::readHelper( - common::Filter* filter, + velox::common::Filter* filter, const RowSet& rows) { ExtractToReader extractValues(this); - common::AlwaysTrue alwaysTrue; + velox::common::AlwaysTrue alwaysTrue; DirectRleColumnVisitor< int64_t, - common::AlwaysTrue, + velox::common::AlwaysTrue, decltype(extractValues), isDense> visitor(alwaysTrue, this, rows, extractValues); @@ -178,23 +178,23 @@ void SelectiveTimestampColumnReader::readHelper( // 1) No filter found; // 2) Filter is kIsNotNull but rawNulls==NULL (no elements is null). switch ( - !filter || (filter->kind() == common::FilterKind::kIsNotNull && !rawNulls) - ? 
common::FilterKind::kAlwaysTrue + !filter || (filter->kind() == velox::common::FilterKind::kIsNotNull && !rawNulls) + ? velox::common::FilterKind::kAlwaysTrue : filter->kind()) { - case common::FilterKind::kAlwaysTrue: + case velox::common::FilterKind::kAlwaysTrue: // Simply add all rows to output. for (vector_size_t i = 0; i < numValues_; i++) { addOutputRow(rows[i]); } break; - case common::FilterKind::kIsNull: + case velox::common::FilterKind::kIsNull: processNulls(true, rows, rawNulls); break; - case common::FilterKind::kIsNotNull: + case velox::common::FilterKind::kIsNotNull: processNulls(false, rows, rawNulls); break; - case common::FilterKind::kTimestampRange: - case common::FilterKind::kMultiRange: + case velox::common::FilterKind::kTimestampRange: + case velox::common::FilterKind::kMultiRange: processFilter(filter, rows, rawNulls); break; default: @@ -234,7 +234,7 @@ void SelectiveTimestampColumnReader::processNulls( } void SelectiveTimestampColumnReader::processFilter( - const common::Filter* filter, + const velox::common::Filter* filter, const RowSet& rows, const uint64_t* rawNulls) { auto rawTs = values_->asMutable(); diff --git a/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.h b/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.h index 44ba1feb113d..1210b9030d82 100644 --- a/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.h @@ -30,7 +30,7 @@ class SelectiveTimestampColumnReader SelectiveTimestampColumnReader( const std::shared_ptr& fileType, DwrfParams& params, - common::ScanSpec& scanSpec); + velox::common::ScanSpec& scanSpec); void seekToRowGroup(int64_t index) override; uint64_t skip(uint64_t numValues) override; @@ -42,12 +42,12 @@ class SelectiveTimestampColumnReader private: template - void readHelper(common::Filter* filter, const RowSet& rows); + void readHelper(velox::common::Filter* filter, const RowSet& rows); void processNulls(const bool isNull, const RowSet& rows, const uint64_t* rawNulls); void processFilter( - const common::Filter* filter, + const velox::common::Filter* filter, const RowSet& rows, const uint64_t* rawNulls); diff --git a/velox/dwio/dwrf/test/ColumnWriterIndexTest.cpp b/velox/dwio/dwrf/test/ColumnWriterIndexTest.cpp index a8abecaed2fb..0b4a29a4962b 100644 --- a/velox/dwio/dwrf/test/ColumnWriterIndexTest.cpp +++ b/velox/dwio/dwrf/test/ColumnWriterIndexTest.cpp @@ -338,7 +338,7 @@ class WriterEncodingIndexTest2 { // Indices are captured the same way for all stripes in the derived tests. for (size_t j = 0; j != stripeCount; ++j) { for (size_t i = 0; i != pageCount; ++i) { - columnWriter->write(batch, common::Ranges::of(0, 1000)); + columnWriter->write(batch, velox::common::Ranges::of(0, 1000)); for (auto n = 0; n < mocks.size(); ++n) { EXPECT_CALL(*mocks.at(n), addEntry(_)) .WillOnce(Invoke([&, k = n](const StatisticsBuilder& builder) { @@ -774,7 +774,7 @@ class IntegerColumnWriterDirectEncodingIndexTest : public testing::Test { break; } } - columnWriter->write(batch, common::Ranges::of(0, 1000)); + columnWriter->write(batch, velox::common::Ranges::of(0, 1000)); EXPECT_CALL(*mockIndexBuilderPtr, addEntry(_)) .WillOnce(Invoke([&](const StatisticsBuilder& builder) { auto stats = builder.build(); @@ -789,7 +789,7 @@ class IntegerColumnWriterDirectEncodingIndexTest : public testing::Test { // The rest of the strides are all written directly in direct encoding. 
       for (size_t i = currentPage + 1; i < pageCount; ++i) {
-        columnWriter->write(batch, common::Ranges::of(0, 1000));
+        columnWriter->write(batch, velox::common::Ranges::of(0, 1000));
         if (abandonDict_) {
           if (callAbandonDict(j, i)) {
             // These calls should essentially be no-ops.
@@ -827,7 +827,7 @@ class IntegerColumnWriterDirectEncodingIndexTest : public testing::Test {
           });
         } else {
           for (size_t i = 0; i != pageCount; ++i) {
-            columnWriter->write(batch, common::Ranges::of(0, 1000));
+            columnWriter->write(batch, velox::common::Ranges::of(0, 1000));
             if (abandonDict_) {
               if (callAbandonDict(j, i)) {
                 // These calls should essentially be no-ops.
@@ -950,7 +950,7 @@ class StringColumnWriterDictionaryEncodingIndexTest : public testing::Test {
     // encoding.
     for (size_t j = 0; j != stripeCount; ++j) {
       for (size_t i = 0; i != pageCount; ++i) {
-        columnWriter->write(batch, common::Ranges::of(0, 1000));
+        columnWriter->write(batch, velox::common::Ranges::of(0, 1000));
         EXPECT_CALL(*mockIndexBuilderPtr, addEntry(_))
             .WillOnce(Invoke([&](const StatisticsBuilder& builder) {
               auto stats = builder.build();
@@ -1081,7 +1081,7 @@ class StringColumnWriterDirectEncodingIndexTest : public testing::Test {
           break;
         }
       }
-      columnWriter->write(batch, common::Ranges::of(0, 1000));
+      columnWriter->write(batch, velox::common::Ranges::of(0, 1000));
       EXPECT_CALL(*mockIndexBuilderPtr, addEntry(_))
           .WillOnce(Invoke([&](const StatisticsBuilder& builder) {
             auto stats = builder.build();
@@ -1096,7 +1096,7 @@ class StringColumnWriterDirectEncodingIndexTest : public testing::Test {
       // The rest of the strides are all written directly in direct encoding.
       for (size_t i = currentPage + 1; i < pageCount; ++i) {
-        columnWriter->write(batch, common::Ranges::of(0, 1000));
+        columnWriter->write(batch, velox::common::Ranges::of(0, 1000));
         if (abandonDict_) {
           if (callAbandonDict(j, i)) {
             // These calls should essentially be no-ops.
@@ -1134,7 +1134,7 @@ class StringColumnWriterDirectEncodingIndexTest : public testing::Test {
           });
         } else {
           for (size_t i = 0; i != pageCount; ++i) {
-            columnWriter->write(batch, common::Ranges::of(0, 1000));
+            columnWriter->write(batch, velox::common::Ranges::of(0, 1000));
             if (abandonDict_) {
               if (callAbandonDict(j, i)) {
                 // These calls should essentially be no-ops.
diff --git a/velox/dwio/dwrf/test/ColumnWriterTest.cpp b/velox/dwio/dwrf/test/ColumnWriterTest.cpp
index 2cf83e888804..2721c8ff95b4 100644
--- a/velox/dwio/dwrf/test/ColumnWriterTest.cpp
+++ b/velox/dwio/dwrf/test/ColumnWriterTest.cpp
@@ -356,7 +356,7 @@ void testDataTypeWriter(
   for (auto stripeI = 0; stripeI < stripeCount; ++stripeI) {
     proto::StripeFooter sf;
     for (auto strideI = 0; strideI < strideCount; ++strideI) {
-      writer->write(batch, common::Ranges::of(0, size));
+      writer->write(batch, velox::common::Ranges::of(0, size));
       writer->createIndexEntry();
     }
     writer->flush([&sf](uint32_t /* unused */) -> proto::ColumnEncoding& {
@@ -1020,7 +1020,7 @@ void testMapWriter(
         toWrite = wrapInDictionary(toWrite, strideI, pool);
       }
     }
-    writer->write(toWrite, common::Ranges::of(0, toWrite->size()));
+    writer->write(toWrite, velox::common::Ranges::of(0, toWrite->size()));
     writer->createIndexEntry();
     writtenBatches.push_back(toWrite);
   }
@@ -1157,7 +1157,7 @@ void testMapWriterRow(
     if (testEncoded) {
       toWrite = wrapInDictionaryRow(toWrite, pool);
     }
-    writer->write(toWrite, common::Ranges::of(0, toWrite->size()));
+    writer->write(toWrite, velox::common::Ranges::of(0, toWrite->size()));
     writer->createIndexEntry();
     writtenBatches.push_back(toWrite);
 
@@ -2180,7 +2180,7 @@ struct IntegerColumnWriterTypedTestCase {
     for (size_t i = 0; i != flushCount; ++i) {
       proto::StripeFooter stripeFooter;
       for (size_t j = 0; j != repetitionCount; ++j) {
-        columnWriter->write(batch, common::Ranges::of(0, batch->size()));
+        columnWriter->write(batch, velox::common::Ranges::of(0, batch->size()));
         postProcess(*columnWriter, i, j);
         columnWriter->createIndexEntry();
       }
@@ -3416,7 +3416,7 @@ struct StringColumnWriterTestCase {
      // Write Stride
      for (size_t j = 0; j != repetitionCount; ++j) {
        // TODO: break the batch into multiple strides.
- columnWriter->write(batches[j], common::Ranges::of(0, size)); + columnWriter->write(batches[j], velox::common::Ranges::of(0, size)); postProcess(*columnWriter, i, j); columnWriter->createIndexEntry(); } @@ -4250,7 +4250,7 @@ TEST_F(ColumnWriterTest, IntDictWriterDirectValueOverflow) { auto vector = populateBatch(data, pool_.get()); auto writer = BaseColumnWriter::create(context, *typeWithId, 0); - writer->write(vector, common::Ranges::of(0, size)); + writer->write(vector, velox::common::Ranges::of(0, size)); writer->createIndexEntry(); proto::StripeFooter sf; writer->flush([&sf](auto /* unused */) -> proto::ColumnEncoding& { @@ -4295,7 +4295,7 @@ TEST_F(ColumnWriterTest, ShortDictWriterDictValueOverflow) { auto vector = populateBatch(data, pool_.get()); auto writer = BaseColumnWriter::create(context, *typeWithId, 0); - writer->write(vector, common::Ranges::of(0, size)); + writer->write(vector, velox::common::Ranges::of(0, size)); writer->createIndexEntry(); proto::StripeFooter sf; writer->flush([&sf](auto /* unused */) -> proto::ColumnEncoding& { @@ -4336,7 +4336,7 @@ TEST_F(ColumnWriterTest, RemovePresentStream) { // write auto writer = BaseColumnWriter::create(context, *typeWithId, 0); - writer->write(vector, common::Ranges::of(0, size)); + writer->write(vector, velox::common::Ranges::of(0, size)); writer->createIndexEntry(); proto::StripeFooter sf; writer->flush([&sf](auto /* unused */) -> proto::ColumnEncoding& { @@ -4374,7 +4374,7 @@ TEST_F(ColumnWriterTest, ColumnIdInStream) { // write auto writer = BaseColumnWriter::create(context, *typeWithId, 0); - writer->write(vector, common::Ranges::of(0, size)); + writer->write(vector, velox::common::Ranges::of(0, size)); writer->createIndexEntry(); proto::StripeFooter sf; writer->flush([&sf](auto /* unused */) -> proto::ColumnEncoding& { @@ -4502,7 +4502,7 @@ struct DictColumnWriterTestCase { if (writeDirect_) { writer->tryAbandonDictionaries(true); } - writer->write(batch, common::Ranges::of(0, batch->size())); + writer->write(batch, velox::common::Ranges::of(0, batch->size())); writer->createIndexEntry(); proto::StripeFooter sf; diff --git a/velox/dwio/dwrf/test/E2EWriterTest.cpp b/velox/dwio/dwrf/test/E2EWriterTest.cpp index e069d56c8644..123a133845b7 100644 --- a/velox/dwio/dwrf/test/E2EWriterTest.cpp +++ b/velox/dwio/dwrf/test/E2EWriterTest.cpp @@ -241,12 +241,12 @@ class E2EWriterTest : public testing::Test { } } - static common::SpillConfig getSpillConfig( + static velox::common::SpillConfig getSpillConfig( int32_t minSpillableReservationPct, int32_t spillableReservationGrowthPct, uint64_t writerFlushThresholdSize = 0) { static const std::string emptySpillFolder = ""; - return common::SpillConfig( + return velox::common::SpillConfig( [&]() -> const std::string& { return emptySpillFolder; }, [&](uint64_t) {}, "fakeSpillConfig", @@ -1642,7 +1642,7 @@ TEST_F(E2EWriterTest, memoryConfigError) { dwrf::WriterOptions options; options.schema = type; - const common::SpillConfig spillConfig = getSpillConfig(10, 20); + const velox::common::SpillConfig spillConfig = getSpillConfig(10, 20); options.spillConfig = &spillConfig; auto writerPool = memory::memoryManager()->addRootPool( "memoryReclaim", 1L << 30, exec::MemoryReclaimer::create()); @@ -1674,7 +1674,7 @@ DEBUG_ONLY_TEST_F(E2EWriterTest, memoryReclaimOnWrite) { for (int i = 0; i < 10; ++i) { vectors.push_back(fuzzer.fuzzInputRow(type)); } - const common::SpillConfig spillConfig = getSpillConfig(10, 20); + const velox::common::SpillConfig spillConfig = getSpillConfig(10, 20); for (bool 
enableReclaim : {false, true}) { SCOPED_TRACE(fmt::format("enableReclaim {}", enableReclaim)); @@ -1801,7 +1801,7 @@ DEBUG_ONLY_TEST_F(E2EWriterTest, memoryReclaimOnFlush) { for (int i = 0; i < 10; ++i) { vectors.push_back(fuzzer.fuzzInputRow(type)); } - const common::SpillConfig spillConfig = getSpillConfig(10, 20); + const velox::common::SpillConfig spillConfig = getSpillConfig(10, 20); for (bool enableReclaim : {false, true}) { SCOPED_TRACE(fmt::format("enableReclaim {}", enableReclaim)); @@ -1887,7 +1887,7 @@ TEST_F(E2EWriterTest, memoryReclaimAfterClose) { vectors.push_back(fuzzer.fuzzInputRow(type)); } - const common::SpillConfig spillConfig = getSpillConfig(10, 20); + const velox::common::SpillConfig spillConfig = getSpillConfig(10, 20); struct { bool canReclaim; bool abort; @@ -1984,7 +1984,7 @@ DEBUG_ONLY_TEST_F(E2EWriterTest, memoryReclaimDuringInit) { }, leafPool_.get()); - const common::SpillConfig spillConfig = getSpillConfig(10, 20); + const velox::common::SpillConfig spillConfig = getSpillConfig(10, 20); for (const auto& reclaimable : {false, true}) { SCOPED_TRACE(fmt::format("reclaimable {}", reclaimable)); @@ -2074,7 +2074,7 @@ DEBUG_ONLY_TEST_F(E2EWriterTest, memoryReclaimThreshold) { "writerFlushThresholdSize {}", succinctBytes(writerFlushThresholdSize))); - const common::SpillConfig spillConfig = + const velox::common::SpillConfig spillConfig = getSpillConfig(10, 20, writerFlushThresholdSize); auto config = std::make_shared(); config->set(dwrf::Config::STRIPE_SIZE, 1L << 30); diff --git a/velox/dwio/dwrf/test/FloatColumnWriterBenchmark.cpp b/velox/dwio/dwrf/test/FloatColumnWriterBenchmark.cpp index 0f965d0ca4af..3c952c29c08c 100644 --- a/velox/dwio/dwrf/test/FloatColumnWriterBenchmark.cpp +++ b/velox/dwio/dwrf/test/FloatColumnWriterBenchmark.cpp @@ -80,7 +80,7 @@ void runBenchmark(int nullEvery) { config, memory::memoryManager()->addRootPool("FloatColumnWriterBenchmark")}; auto writer = BaseColumnWriter::create(context, *typeWithId, 0); - writer->write(vector, common::Ranges::of(0, kVectorSize)); + writer->write(vector, velox::common::Ranges::of(0, kVectorSize)); } } diff --git a/velox/dwio/dwrf/test/IntEncoderBenchmark.cpp b/velox/dwio/dwrf/test/IntEncoderBenchmark.cpp index 644ddab0c587..845f7a42e640 100644 --- a/velox/dwio/dwrf/test/IntEncoderBenchmark.cpp +++ b/velox/dwio/dwrf/test/IntEncoderBenchmark.cpp @@ -58,7 +58,7 @@ static size_t generateAutoId2(int64_t startId, int64_t count) { for (int64_t i = 0; i < bufCount; ++i) { buffer[i] = currentId++; } - encoder->add(buffer, common::Ranges::of(0, bufCount), nullptr); + encoder->add(buffer, velox::common::Ranges::of(0, bufCount), nullptr); countRemaining -= bufCount; } return encoder->flush(); diff --git a/velox/dwio/dwrf/test/LayoutPlannerTests.cpp b/velox/dwio/dwrf/test/LayoutPlannerTests.cpp index a2de14e8e91f..0dc611f46fa9 100644 --- a/velox/dwio/dwrf/test/LayoutPlannerTests.cpp +++ b/velox/dwio/dwrf/test/LayoutPlannerTests.cpp @@ -99,7 +99,7 @@ TEST_F(LayoutPlannerTest, CreateNodeToColumnIdMapping) { TEST_F(LayoutPlannerTest, Basic) { auto config = std::make_shared(); config->set( - Config::COMPRESSION, common::CompressionKind::CompressionKind_NONE); + Config::COMPRESSION, velox::common::CompressionKind::CompressionKind_NONE); WriterContext context{ config, facebook::velox::memory::memoryManager()->addRootPool( diff --git a/velox/dwio/dwrf/test/ReaderTest.cpp b/velox/dwio/dwrf/test/ReaderTest.cpp index b132757ee54f..569585f85cec 100644 --- a/velox/dwio/dwrf/test/ReaderTest.cpp +++ 
b/velox/dwio/dwrf/test/ReaderTest.cpp @@ -2117,7 +2117,7 @@ TEST_F(TestReader, setRowNumberColumnInfo) { auto schema = asRowType(batches[0]->type()); auto [writer, reader] = createWriterReader(batches, pool()); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addAllChildFields(*schema); RowReaderOptions rowReaderOpts; rowReaderOpts.setScanSpec(spec); @@ -2131,7 +2131,7 @@ TEST_F(TestReader, setRowNumberColumnInfo) { verifyRowNumbers(*rowReader, pool(), 16); } spec->childByName("c0")->setFilter( - common::createBigintValues({1, 4, 5, 7, 11, 14}, false)); + velox::common::createBigintValues({1, 4, 5, 7, 11, 14}, false)); spec->resetCachedValues(true); { SCOPED_TRACE("Selective with filter"); @@ -2146,7 +2146,7 @@ TEST_F(TestReader, reuseRowNumberColumn) { auto schema = asRowType(batches[0]->type()); auto [writer, reader] = createWriterReader(batches, pool()); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addAllChildFields(*schema); RowReaderOptions rowReaderOpts; rowReaderOpts.setScanSpec(spec); @@ -2211,10 +2211,10 @@ TEST_F(TestReader, explicitRowNumberColumn) { }; auto batches = createBatches(integerValues); auto [writer, reader] = createWriterReader(batches, pool()); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addField("c0", 0); spec->addField("$row_number", 1) - ->setColumnType(common::ScanSpec::ColumnType::kRowIndex); + ->setColumnType(velox::common::ScanSpec::ColumnType::kRowIndex); RowReaderOptions rowReaderOpts; rowReaderOpts.setScanSpec(spec); { @@ -2223,7 +2223,7 @@ TEST_F(TestReader, explicitRowNumberColumn) { verifyRowNumbers(*rowReader, pool(), 16, true); } spec->childByName("c0")->setFilter( - common::createBigintValues({1, 4, 5, 7, 11, 14}, false)); + velox::common::createBigintValues({1, 4, 5, 7, 11, 14}, false)); spec->resetCachedValues(true); { SCOPED_TRACE("Selective with filter"); @@ -2248,7 +2248,7 @@ TEST_F(TestReader, failToReuseReaderNulls) { }); auto schema = asRowType(data->type()); auto [writer, reader] = createWriterReader({data}, pool()); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addAllChildFields(*schema); spec->childByName("c0")->childByName("a")->setFilter( std::make_unique( @@ -2300,11 +2300,11 @@ TEST_F(TestReader, readFlatMapsSomeEmpty) { auto [writer, reader] = createWriterReader({row}, pool(), config); auto schema = asRowType(row->type()); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addAllChildFields(*schema); spec->childByName("a") - ->childByName(common::ScanSpec::kMapKeysFieldName) - ->setFilter(common::createBigintValues({1, 2, 3}, false)); + ->childByName(velox::common::ScanSpec::kMapKeysFieldName) + ->setFilter(velox::common::createBigintValues({1, 2, 3}, false)); RowReaderOptions rowReaderOpts; rowReaderOpts.setScanSpec(spec); @@ -2366,11 +2366,11 @@ TEST_F(TestReader, readFlatMapsWithNullMaps) { auto [writer, reader] = createWriterReader({row}, pool(), config); auto schema = asRowType(row->type()); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addAllChildFields(*schema); spec->childByName("a") - ->childByName(common::ScanSpec::kMapKeysFieldName) - ->setFilter(common::createBigintValues({1, 2, 3}, false)); + ->childByName(velox::common::ScanSpec::kMapKeysFieldName) + ->setFilter(velox::common::createBigintValues({1, 2, 3}, false)); RowReaderOptions rowReaderOpts; rowReaderOpts.setScanSpec(spec); @@ -2439,7 +2439,7 @@ TEST_F(TestReader, 
readStructWithWholeBatchFiltered) { auto [writer, reader] = createWriterReader({row}, pool()); auto schema = asRowType(row->type()); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addAllChildFields(*schema); // Create a filter that will filter out all rows in the first batch. spec->childByName("c0")->setFilter(std::make_unique()); @@ -2496,7 +2496,7 @@ TEST_F(TestReader, readStringDictionaryAsFlat) { // for first batch. E2EWriterTestUtil::simpleFlushPolicyFactory(false)); auto rowType = reader->rowType(); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addAllChildFields(*rowType); RowReaderOptions rowReaderOpts; rowReaderOpts.setScanSpec(spec); @@ -2511,7 +2511,7 @@ TEST_F(TestReader, readStringDictionaryAsFlat) { dwio::common::RuntimeStatistics stats; rowReader->updateRuntimeStats(stats); ASSERT_EQ(stats.columnReaderStatistics.flattenStringDictionaryValues, 0); - spec->childByName("c0")->setFilter(std::make_unique( + spec->childByName("c0")->setFilter(std::make_unique( std::vector{"aaaaaaaaaaaaaaaaaaaa"}, false)); spec->resetCachedValues(true); rowReader = reader->createRowReader(rowReaderOpts); @@ -2533,7 +2533,7 @@ TEST_F(TestReader, missingSubfieldsNoResultReusing) { }); auto [writer, reader] = createWriterReader({batch}, pool()); auto schema = ROW({{"c0", ROW({{"c0", BIGINT()}, {"c1", VARCHAR()}})}}); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addAllChildFields(*schema); RowReaderOptions rowReaderOpts; rowReaderOpts.setScanSpec(spec); @@ -2562,11 +2562,11 @@ TEST_F(TestReader, selectiveStringDirectFastPath) { }); auto [writer, reader] = createWriterReader({batch}, pool()); auto schema = asRowType(batch->type()); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addAllChildFields(*schema); RowReaderOptions rowReaderOpts; rowReaderOpts.setScanSpec(spec); - spec->childByName("c0")->setFilter(common::createBigintValues({1}, false)); + spec->childByName("c0")->setFilter(velox::common::createBigintValues({1}, false)); auto rowReader = reader->createRowReader(rowReaderOpts); auto actual = BaseVector::create(schema, 0, pool()); ASSERT_EQ(rowReader->next(1024, actual), batch->size()); @@ -2588,11 +2588,11 @@ TEST_F(TestReader, selectiveStringDirect) { }); auto [writer, reader] = createWriterReader({batch}, pool()); auto schema = asRowType(batch->type()); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addAllChildFields(*schema); RowReaderOptions rowReaderOpts; rowReaderOpts.setScanSpec(spec); - spec->childByName("c0")->setFilter(common::createBigintValues({1}, false)); + spec->childByName("c0")->setFilter(velox::common::createBigintValues({1}, false)); auto rowReader = reader->createRowReader(rowReaderOpts); auto actual = BaseVector::create(schema, 0, pool()); ASSERT_EQ(rowReader->next(1024, actual), batch->size()); @@ -2612,7 +2612,7 @@ TEST_F(TestReader, selectiveFlatMapFastPathAllInlinedStringKeys) { config->set(dwrf::Config::MAP_FLAT_COLS, {0}); auto [writer, reader] = createWriterReader({row}, pool(), config); auto schema = asRowType(row->type()); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addAllChildFields(*schema); RowReaderOptions rowReaderOpts; rowReaderOpts.setScanSpec(spec); @@ -2631,7 +2631,7 @@ TEST_F(TestReader, skipLongString) { dwio::common::ReaderOptions readerOpts(pool()); readerOpts.setFileFormat(FileFormat::DWRF); auto reader = DwrfReader::create(std::move(input), readerOpts); - auto spec 
= std::make_shared(""); + auto spec = std::make_shared(""); spec->addField("c0", 0); spec->getOrCreateChild("c1")->setFilter( std::make_unique(true, false)); diff --git a/velox/dwio/dwrf/test/TestByteRLEEncoder.cpp b/velox/dwio/dwrf/test/TestByteRLEEncoder.cpp index 9377206804c1..131ad589f433 100644 --- a/velox/dwio/dwrf/test/TestByteRLEEncoder.cpp +++ b/velox/dwio/dwrf/test/TestByteRLEEncoder.cpp @@ -130,7 +130,7 @@ TEST_F(ByteRleEncoderTest, random_chars) { char* data = new char[102400]; generateData(102400, data); - encoder->add(data, common::Ranges::of(0, 102400), nullptr); + encoder->add(data, velox::common::Ranges::of(0, 102400), nullptr); encoder->flush(); decodeAndVerify(memSink, data, 102400, nullptr); @@ -151,7 +151,7 @@ TEST_F(ByteRleEncoderTest, random_chars_with_null) { uint64_t* nulls = new uint64_t[1600]; char* data = new char[102400]; generateData(102400, data, 377, nulls); - encoder->add(data, common::Ranges::of(0, 102400), nulls); + encoder->add(data, velox::common::Ranges::of(0, 102400), nulls); encoder->flush(); decodeAndVerify(memSink, data, 102400, nulls); @@ -179,7 +179,7 @@ TEST_F(BooleanRleEncoderTest, random_bits_not_aligned) { char* data = new char[1779]; generateBoolData(1779, data); - encoder->add(data, common::Ranges::of(0, 1779), nullptr); + encoder->add(data, velox::common::Ranges::of(0, 1779), nullptr); encoder->flush(); decodeAndVerifyBoolean(memSink, data, 1779, nullptr); @@ -199,7 +199,7 @@ TEST_F(BooleanRleEncoderTest, random_bits_aligned) { char* data = new char[8000]; generateBoolData(8000, data); - encoder->add(data, common::Ranges::of(0, 8000), nullptr); + encoder->add(data, velox::common::Ranges::of(0, 8000), nullptr); encoder->flush(); decodeAndVerifyBoolean(memSink, data, 8000, nullptr); @@ -220,7 +220,7 @@ TEST_F(BooleanRleEncoderTest, random_bits_aligned_with_null) { uint64_t* nulls = new uint64_t[125]; char* data = new char[8000]; generateBoolData(8000, data, 515, nulls); - encoder->add(data, common::Ranges::of(0, 8000), nulls); + encoder->add(data, velox::common::Ranges::of(0, 8000), nulls); encoder->flush(); decodeAndVerifyBoolean(memSink, data, 8000, nulls); diff --git a/velox/dwio/dwrf/test/TestColumnReader.cpp b/velox/dwio/dwrf/test/TestColumnReader.cpp index b0b730ed4cad..442ba01d2f90 100644 --- a/velox/dwio/dwrf/test/TestColumnReader.cpp +++ b/velox/dwio/dwrf/test/TestColumnReader.cpp @@ -63,12 +63,12 @@ void makeFieldSpecs( const std::string& pathPrefix, int32_t level, const std::shared_ptr& type, - common::ScanSpec* spec) { + velox::common::ScanSpec* spec) { for (auto i = 0; i < type->size(); ++i) { std::string path = level == 0 ? type->nameOf(i) : pathPrefix + "." 
+ type->nameOf(i); - common::Subfield subfield(path); - common::ScanSpec* fieldSpec = spec->getOrCreateChild(subfield); + velox::common::Subfield subfield(path); + velox::common::ScanSpec* fieldSpec = spec->getOrCreateChild(subfield); fieldSpec->setProjectOut(true); if (level == 0) { fieldSpec->setChannel(i); @@ -118,7 +118,7 @@ class ColumnReaderTestBase { const std::shared_ptr& requestedType, const std::shared_ptr& fileType = nullptr, std::vector nodes = {}, - common::ScanSpec* scanSpec = nullptr) { + velox::common::ScanSpec* scanSpec = nullptr) { const std::shared_ptr& rowType = std::dynamic_pointer_cast(requestedType); if (parallelDecoding() && !executor_) { @@ -139,7 +139,7 @@ class ColumnReaderTestBase { if (useSelectiveReader()) { if (!scanSpec) { - scanSpec_ = std::make_unique("root"); + scanSpec_ = std::make_unique("root"); scanSpec_->addAllChildFields(*fileTypeWithId->type()); scanSpec = scanSpec_.get(); } @@ -230,7 +230,7 @@ class ColumnReaderTestBase { std::unique_ptr selectiveColumnReader_; private: - std::unique_ptr scanSpec_; + std::unique_ptr scanSpec_; ColumnReaderStatistics columnReaderStatistics_; }; @@ -472,7 +472,7 @@ class SchemaMismatchTest : public TestWithParam, // build columnReader_ and selectiveColumnReader_. They are used as // mismatch ColumnReaders - auto scanSpec2 = std::make_unique("root2"); + auto scanSpec2 = std::make_unique("root2"); buildReader(requestedType, fileType, {}, scanSpec2.get()); VectorPtr mismatchBatch = newBatch(requestedType); if (columnReader_) { @@ -911,7 +911,7 @@ TEST_P(TestColumnReader, testIntegerRLEv2) { auto fileType = TypeWithId::create(rowType)->type(); VectorPtr batch = newBatch(rowType); if (useSelectiveReader()) { - auto scanSpec = std::make_unique("root"); + auto scanSpec = std::make_unique("root"); scanSpec->addAllChildFields(*fileType); scanSpec->childByName("col_0")->setFilter( std::make_unique(2100, 2140, false)); diff --git a/velox/dwio/dwrf/test/TestIntDirect.cpp b/velox/dwio/dwrf/test/TestIntDirect.cpp index 6d0cb721cc45..215b6fb6ba92 100644 --- a/velox/dwio/dwrf/test/TestIntDirect.cpp +++ b/velox/dwio/dwrf/test/TestIntDirect.cpp @@ -50,7 +50,7 @@ void testInts(std::function generator) { auto encoder = createDirectEncoder(std::move(output), vInt, sizeof(T)); - encoder->add(buffer.data(), common::Ranges::of(0, count), nulls.data()); + encoder->add(buffer.data(), velox::common::Ranges::of(0, count), nulls.data()); TestPositionRecorder recorder; recorder.addEntry(); diff --git a/velox/dwio/dwrf/test/TestRLEv1Encoder.cpp b/velox/dwio/dwrf/test/TestRLEv1Encoder.cpp index 627759ec7a59..1d056cf541d8 100644 --- a/velox/dwio/dwrf/test/TestRLEv1Encoder.cpp +++ b/velox/dwio/dwrf/test/TestRLEv1Encoder.cpp @@ -106,10 +106,10 @@ TEST_F(RleEncoderV1Test, encodeMinAndMax) { std::make_unique(holder), true, 8); auto data = folly::make_array(INT64_MIN, INT64_MAX, INT64_MIN); - encoder.add(data.data(), common::Ranges::of(0, 2), nullptr); + encoder.add(data.data(), velox::common::Ranges::of(0, 2), nullptr); EXPECT_TRUE(encoder.overflow_); - encoder.add(data.data(), common::Ranges::of(2, 3), nullptr); + encoder.add(data.data(), velox::common::Ranges::of(2, 3), nullptr); EXPECT_TRUE(encoder.overflow_); encoder.flush(); @@ -126,10 +126,10 @@ TEST_F(RleEncoderV1Test, encodeMinAndMaxint32) { std::make_unique(holder), true, 8); auto data = folly::make_array(INT32_MIN, INT32_MAX, INT32_MIN); - encoder.add(data.data(), common::Ranges::of(0, 2), nullptr); + encoder.add(data.data(), velox::common::Ranges::of(0, 2), nullptr); 
EXPECT_FALSE(encoder.overflow_); - encoder.add(data.data(), common::Ranges::of(2, 3), nullptr); + encoder.add(data.data(), velox::common::Ranges::of(2, 3), nullptr); EXPECT_FALSE(encoder.overflow_); encoder.flush(); @@ -148,7 +148,7 @@ TEST_F(RleEncoderV1Test, deltaIncreasingSequanceUnsigned) { int64_t* data = new int64_t[1024]; generateData(1024, 0, 1, false, data); - encoder.add(data, common::Ranges::of(0, 1024), nullptr); + encoder.add(data, velox::common::Ranges::of(0, 1024), nullptr); encoder.flush(); decodeAndVerify(memSink, data, 1024, nullptr); @@ -167,7 +167,7 @@ TEST_F(RleEncoderV1Test, deltaIncreasingSequanceUnsignedNull) { uint64_t* nulls = new uint64_t[256]; int64_t* data = new int64_t[1024]; generateData(1024, 0, 1, false, data, 100, nulls); - encoder.add(data, common::Ranges::of(0, 1024), nulls); + encoder.add(data, velox::common::Ranges::of(0, 1024), nulls); encoder.flush(); decodeAndVerify(memSink, data, 1024, nulls); @@ -186,7 +186,7 @@ TEST_F(RleEncoderV1Test, deltaDecreasingSequanceUnsigned) { int64_t* data = new int64_t[1024]; generateData(1024, 5000, -3, false, data); - encoder.add(data, common::Ranges::of(0, 1024), nullptr); + encoder.add(data, velox::common::Ranges::of(0, 1024), nullptr); encoder.flush(); decodeAndVerify(memSink, data, 1024, nullptr); @@ -204,7 +204,7 @@ TEST_F(RleEncoderV1Test, deltaDecreasingSequanceSigned) { int64_t* data = new int64_t[1024]; generateData(1024, 100, -3, false, data); - encoder.add(data, common::Ranges::of(0, 1024), nullptr); + encoder.add(data, velox::common::Ranges::of(0, 1024), nullptr); encoder.flush(); decodeAndVerify(memSink, data, 1024, nullptr); @@ -223,7 +223,7 @@ TEST_F(RleEncoderV1Test, deltaDecreasingSequanceSignedNull) { uint64_t* nulls = new uint64_t[256]; int64_t* data = new int64_t[1024]; generateData(1024, 100, -3, false, data, 500, nulls); - encoder.add(data, common::Ranges::of(0, 1024), nulls); + encoder.add(data, velox::common::Ranges::of(0, 1024), nulls); encoder.flush(); decodeAndVerify(memSink, data, 1024, nulls); @@ -242,7 +242,7 @@ TEST_F(RleEncoderV1Test, randomSequanceSigned) { int64_t* data = new int64_t[1024]; generateData(1024, 0, 0, true, data); - encoder.add(data, common::Ranges::of(0, 1024), nullptr); + encoder.add(data, velox::common::Ranges::of(0, 1024), nullptr); encoder.flush(); decodeAndVerify(memSink, data, 1024, nullptr); @@ -261,7 +261,7 @@ TEST_F(RleEncoderV1Test, allNull) { uint64_t* nulls = new uint64_t[256]; int64_t* data = new int64_t[1024]; generateData(1024, 100, -3, false, data, 1024, nulls); - encoder.add(data, common::Ranges::of(0, 1024), nulls); + encoder.add(data, velox::common::Ranges::of(0, 1024), nulls); encoder.flush(); decodeAndVerify(memSink, data, 1024, nulls); @@ -281,7 +281,7 @@ TEST_F(RleEncoderV1Test, recordPosition) { constexpr size_t size = 256; std::array data; generateData(size, 100, 1, false, data.data()); - encoder.add(data.data(), common::Ranges::of(0, size), nullptr); + encoder.add(data.data(), velox::common::Ranges::of(0, size), nullptr); TestPositionRecorder recorder; encoder.recordPosition(recorder); @@ -302,7 +302,7 @@ TEST_F(RleEncoderV1Test, backfillPosition) { constexpr size_t size = 256; std::array data; generateData(size, 100, 1, false, data.data()); - encoder.add(data.data(), common::Ranges::of(0, size), nullptr); + encoder.add(data.data(), velox::common::Ranges::of(0, size), nullptr); TestPositionRecorder recorder; encoder.recordPosition(recorder); @@ -320,7 +320,7 @@ TEST_F(RleEncoderV1Test, backfillPosition) { } std::array moreData; 
generateData(size * 2, 200, 1, false, moreData.data()); - encoder.add(moreData.data(), common::Ranges::of(0, size * 2), nullptr); + encoder.add(moreData.data(), velox::common::Ranges::of(0, size * 2), nullptr); recorder.addEntry(); encoder.recordPosition(recorder, 2); { diff --git a/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp b/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp index 3161ee991e99..10f5d4e1391d 100644 --- a/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp +++ b/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp @@ -83,7 +83,7 @@ TEST_F(TestStatisticsBuilderUtils, addIntegerValues) { auto vec = makeFlatVectorNoNulls(pool_.get(), size, values); { StatisticsBuilderUtils::addValues( - builder, vec, common::Ranges::of(0, size)); + builder, vec, velox::common::Ranges::of(0, size)); auto stats = builder.build(); auto intStats = dynamic_cast(stats.get()); EXPECT_EQ(10, intStats->getNumberOfValues()); @@ -102,7 +102,7 @@ TEST_F(TestStatisticsBuilderUtils, addIntegerValues) { { StatisticsBuilderUtils::addValues( - builder, vec, common::Ranges::of(0, size)); + builder, vec, velox::common::Ranges::of(0, size)); auto stats = builder.build(); auto intStats = dynamic_cast(stats.get()); EXPECT_EQ(19, intStats->getNumberOfValues()); @@ -129,7 +129,7 @@ TEST_F(TestStatisticsBuilderUtils, addDoubleValues) { { auto vec = makeFlatVectorNoNulls(pool_.get(), size, values); StatisticsBuilderUtils::addValues( - builder, vec, common::Ranges::of(0, size)); + builder, vec, velox::common::Ranges::of(0, size)); auto stats = builder.build(); auto doubleStats = dynamic_cast(stats.get()); EXPECT_EQ(10, doubleStats->getNumberOfValues()); @@ -148,7 +148,7 @@ TEST_F(TestStatisticsBuilderUtils, addDoubleValues) { auto vec = makeFlatVector(pool_.get(), nulls, 1, size, values); StatisticsBuilderUtils::addValues( - builder, vec, common::Ranges::of(0, size)); + builder, vec, velox::common::Ranges::of(0, size)); auto stats = builder.build(); auto doubleStats = dynamic_cast(stats.get()); EXPECT_EQ(19, doubleStats->getNumberOfValues()); @@ -176,7 +176,7 @@ TEST_F(TestStatisticsBuilderUtils, addStringValues) { { auto vec = makeFlatVectorNoNulls(pool_.get(), size, values); StatisticsBuilderUtils::addValues( - builder, vec, common::Ranges::of(0, size)); + builder, vec, velox::common::Ranges::of(0, size)); auto stats = builder.build(); auto strStats = dynamic_cast(stats.get()); EXPECT_EQ(10, strStats->getNumberOfValues()); @@ -194,7 +194,7 @@ TEST_F(TestStatisticsBuilderUtils, addStringValues) { { auto vec = makeFlatVector(pool_.get(), nulls, 1, size, values); StatisticsBuilderUtils::addValues( - builder, vec, common::Ranges::of(0, size)); + builder, vec, velox::common::Ranges::of(0, size)); auto stats = builder.build(); auto strStats = dynamic_cast(stats.get()); EXPECT_EQ(19, strStats->getNumberOfValues()); @@ -223,7 +223,7 @@ TEST_F(TestStatisticsBuilderUtils, addBooleanValues) { auto vec = makeFlatVectorNoNulls(pool_.get(), size, values); StatisticsBuilderUtils::addValues( - builder, vec, common::Ranges::of(0, size)); + builder, vec, velox::common::Ranges::of(0, size)); auto stats = builder.build(); auto boolStats = dynamic_cast(stats.get()); EXPECT_EQ(9, boolStats->getTrueCount().value()); @@ -239,7 +239,7 @@ TEST_F(TestStatisticsBuilderUtils, addBooleanValues) { auto vec = makeFlatVector(pool_.get(), nulls, 1, size, values); StatisticsBuilderUtils::addValues( - builder, vec, common::Ranges::of(0, size)); + builder, vec, velox::common::Ranges::of(0, size)); auto stats = builder.build(); auto boolStats = 
dynamic_cast(stats.get()); EXPECT_EQ(17, boolStats->getTrueCount().value()); @@ -260,7 +260,7 @@ TEST_F(TestStatisticsBuilderUtils, addValues) { auto vec = makeFlatVectorNoNulls(pool_.get(), size, values); StatisticsBuilderUtils::addValues( - builder, vec, common::Ranges::of(0, size)); + builder, vec, velox::common::Ranges::of(0, size)); auto stats = builder.build(); EXPECT_EQ(10, stats->getNumberOfValues()); EXPECT_FALSE(stats->hasNull().value()); @@ -274,7 +274,7 @@ TEST_F(TestStatisticsBuilderUtils, addValues) { auto vec = makeFlatVector(pool_.get(), nulls, 1, size, values); StatisticsBuilderUtils::addValues( - builder, vec, common::Ranges::of(0, size)); + builder, vec, velox::common::Ranges::of(0, size)); auto stats = builder.build(); EXPECT_EQ(19, stats->getNumberOfValues()); EXPECT_TRUE(stats->hasNull().value()); @@ -300,7 +300,7 @@ TEST_F(TestStatisticsBuilderUtils, addBinaryValues) { auto vec = makeFlatVectorNoNulls(pool_.get(), size, values); StatisticsBuilderUtils::addValues( - builder, vec, common::Ranges::of(0, size)); + builder, vec, velox::common::Ranges::of(0, size)); auto stats = builder.build(); auto binStats = dynamic_cast(stats.get()); EXPECT_EQ(10, binStats->getNumberOfValues()); @@ -316,7 +316,7 @@ TEST_F(TestStatisticsBuilderUtils, addBinaryValues) { auto vec = makeFlatVector(pool_.get(), nulls, 1, size, values); StatisticsBuilderUtils::addValues( - builder, vec, common::Ranges::of(0, size)); + builder, vec, velox::common::Ranges::of(0, size)); auto stats = builder.build(); auto binStats = dynamic_cast(stats.get()); EXPECT_EQ(19, binStats->getNumberOfValues()); diff --git a/velox/dwio/dwrf/utils/ProtoUtils.cpp b/velox/dwio/dwrf/utils/ProtoUtils.cpp index e907c4126d3e..ff0c0b7dc940 100644 --- a/velox/dwio/dwrf/utils/ProtoUtils.cpp +++ b/velox/dwio/dwrf/utils/ProtoUtils.cpp @@ -63,7 +63,7 @@ void ProtoUtils::writeType( // testing before the ORC footer write is implemented. 
auto kind = SchemaType::kind; self->set_kind(kind); - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::dwrf::ProtoUtils::writeType", &kindSet); } else { auto kind = diff --git a/velox/dwio/dwrf/writer/ColumnWriter.cpp b/velox/dwio/dwrf/writer/ColumnWriter.cpp index a1084f4d6104..bd74dc19c5d5 100644 --- a/velox/dwio/dwrf/writer/ColumnWriter.cpp +++ b/velox/dwio/dwrf/writer/ColumnWriter.cpp @@ -34,7 +34,7 @@ namespace facebook::velox::dwrf { WriterContext::LocalDecodedVector BaseColumnWriter::decode( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { auto& selected = context_.getSharedSelectivityVector(slice->size()); // initialize selected.clearAll(); @@ -65,7 +65,7 @@ class ByteRleColumnWriter : public BaseColumnWriter { reset(); } - uint64_t write(const VectorPtr& slice, const common::Ranges& ranges) override; + uint64_t write(const VectorPtr& slice, const velox::common::Ranges& ranges) override; void flush( std::function encodingFactory, @@ -86,7 +86,7 @@ class ByteRleColumnWriter : public BaseColumnWriter { template <> uint64_t ByteRleColumnWriter::write( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { static_assert(sizeof(int8_t) == sizeof(char), "unexpected type width"); static_assert(NULL_SIZE == 1, "unexpected raw data size"); if (slice->encoding() == VectorEncoding::Simple::FLAT) { @@ -136,7 +136,7 @@ uint64_t ByteRleColumnWriter::write( template <> uint64_t ByteRleColumnWriter::write( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { static_assert(sizeof(bool) == sizeof(char), "unexpected type width"); static_assert(NULL_SIZE == 1, "unexpected raw data size"); if (slice->encoding() == VectorEncoding::Simple::FLAT) { @@ -228,7 +228,7 @@ class IntegerColumnWriter : public BaseColumnWriter { reset(); } - uint64_t write(const VectorPtr& slice, const common::Ranges& ranges) override; + uint64_t write(const VectorPtr& slice, const velox::common::Ranges& ranges) override; void reset() override { // Lots of decisions regarding the presence of streams are made at flush @@ -366,9 +366,9 @@ class IntegerColumnWriter : public BaseColumnWriter { private: uint64_t writeDict( DecodedVector& decodedVector, - const common::Ranges& ranges); + const velox::common::Ranges& ranges); - uint64_t writeDirect(const VectorPtr& slice, const common::Ranges& ranges); + uint64_t writeDirect(const VectorPtr& slice, const velox::common::Ranges& ranges); void ensureValidStreamWriters(bool dictEncoding) { // Ensure we have valid streams for exactly one encoding. 
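The bulk of the writer-side churn in this patch is one mechanical change repeated: unqualified common:: references become velox::common::. A plausible motivation, consistent with the connector::common:: calls introduced in the ParquetTpchTest.cpp hunk later in this patch, is that a nearer nested "common" namespace can shadow facebook::velox::common during unqualified lookup. The standalone sketch below (illustrative names only, not Velox code) shows that lookup behavior.

#include <iostream>

namespace facebook::velox::common {
struct Ranges {
  static Ranges of(int begin, int end) {
    return Ranges{begin, end};
  }
  int begin;
  int end;
};
} // namespace facebook::velox::common

namespace facebook::velox::connector::common {
struct Registry {}; // a sibling "common" that has no Ranges in it
} // namespace facebook::velox::connector::common

namespace facebook::velox::connector {
void describe() {
  // Unqualified "common::Ranges" here would find connector::common first and
  // fail to compile; the velox::common:: spelling below is unambiguous.
  auto r = velox::common::Ranges::of(0, 1024);
  std::cout << r.begin << ".." << r.end << "\n";
}
} // namespace facebook::velox::connector

int main() {
  facebook::velox::connector::describe();
  return 0;
}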
@@ -479,7 +479,7 @@ class IntegerColumnWriter : public BaseColumnWriter { template uint64_t IntegerColumnWriter::write( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { if (useDictionaryEncoding_) { // Decode and then write auto localDecoded = decode(slice, ranges); @@ -506,7 +506,7 @@ uint64_t IntegerColumnWriter::write( template uint64_t IntegerColumnWriter::writeDict( DecodedVector& decodedVector, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { auto& statsBuilder = dynamic_cast(*indexStatsBuilder_); writeNulls(decodedVector, ranges); @@ -545,7 +545,7 @@ uint64_t IntegerColumnWriter::writeDict( template uint64_t IntegerColumnWriter::writeDirect( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { ensureValidStreamWriters(false); auto* flatVector = slice->asFlatVector(); VELOX_CHECK_NOT_NULL(flatVector, "unexpected vector type"); @@ -600,7 +600,7 @@ void IntegerColumnWriter::populateDictionaryEncodingStreams() { end, [&](auto index) { return dictEncoder_.getInDict()[rows_[index]]; }, [&](auto buf, auto size) { - inDictionary_->add(buf, common::Ranges::of(0, size), nullptr); + inDictionary_->add(buf, velox::common::Ranges::of(0, size), nullptr); }); } @@ -615,7 +615,7 @@ void IntegerColumnWriter::populateDictionaryEncodingStreams() { end, [&](auto index) { return dictEncoder_.getLookupTable()[rows_[index]]; }, [&](auto buf, auto size) { - data_->add(buf, common::Ranges::of(0, size), nullptr); + data_->add(buf, velox::common::Ranges::of(0, size), nullptr); }); }; @@ -643,7 +643,7 @@ void IntegerColumnWriter::convertToDirectEncoding() { end, [&](auto index) { return dictEncoder_.getKey(rows_[index]); }, [&](auto buf, auto size) { - dataDirect_->add(buf, common::Ranges::of(0, size), nullptr); + dataDirect_->add(buf, velox::common::Ranges::of(0, size), nullptr); }); }; @@ -676,7 +676,7 @@ class TimestampColumnWriter : public BaseColumnWriter { reset(); } - uint64_t write(const VectorPtr& slice, const common::Ranges& ranges) override; + uint64_t write(const VectorPtr& slice, const velox::common::Ranges& ranges) override; void flush( std::function encodingFactory, @@ -722,7 +722,7 @@ class DecimalColumnWriter : public BaseColumnWriter { reset(); } - uint64_t write(const VectorPtr& slice, const common::Ranges& ranges) + uint64_t write(const VectorPtr& slice, const velox::common::Ranges& ranges) override { VELOX_CHECK( slice->type()->equivalent(*type_), @@ -852,7 +852,7 @@ FOLLY_ALWAYS_INLINE int64_t formatNanos(uint64_t nanos) { uint64_t TimestampColumnWriter::write( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { // Timestamp is not frequently used, always decode so we have less branches to // deal with. 
auto localDecoded = decode(slice, ranges); @@ -934,7 +934,7 @@ class StringColumnWriter : public BaseColumnWriter { reset(); } - uint64_t write(const VectorPtr& slice, const common::Ranges& ranges) override; + uint64_t write(const VectorPtr& slice, const velox::common::Ranges& ranges) override; void reset() override { // Lots of decisions regarding the presence of streams are made at flush @@ -1075,11 +1075,11 @@ class StringColumnWriter : public BaseColumnWriter { private: uint64_t writeDict( DecodedVector& decodedVector, - const common::Ranges& ranges); + const velox::common::Ranges& ranges); uint64_t writeDirect( DecodedVector& decodedVector, - const common::Ranges& ranges); + const velox::common::Ranges& ranges); void ensureValidStreamWriters(bool dictEncoding) { // Ensure we have exactly one valid data stream. @@ -1208,7 +1208,7 @@ class StringColumnWriter : public BaseColumnWriter { uint64_t StringColumnWriter::write( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { auto localDecoded = decode(slice, ranges); auto& decodedVector = localDecoded.get(); @@ -1221,7 +1221,7 @@ uint64_t StringColumnWriter::write( uint64_t StringColumnWriter::writeDict( DecodedVector& decodedVector, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { auto& statsBuilder = dynamic_cast(*indexStatsBuilder_); writeNulls(decodedVector, ranges); @@ -1261,7 +1261,7 @@ uint64_t StringColumnWriter::writeDict( uint64_t StringColumnWriter::writeDirect( DecodedVector& decodedVector, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { auto& statsBuilder = dynamic_cast(*indexStatsBuilder_); writeNulls(decodedVector, ranges); @@ -1298,7 +1298,7 @@ uint64_t StringColumnWriter::writeDirect( } if (lengths.size() > 0) { dataDirectLength_->add( - lengths.data(), common::Ranges::of(0, lengths.size()), nullptr); + lengths.data(), velox::common::Ranges::of(0, lengths.size()), nullptr); } if (nullCount > 0) { @@ -1335,7 +1335,7 @@ void StringColumnWriter::populateDictionaryEncodingStreams() { strideDictCounts, [&](auto buf, auto size) { dictionaryData_->write(buf, size); }, [&](auto buf, auto size) { - dictionaryDataLength_->add(buf, common::Ranges::of(0, size), nullptr); + dictionaryDataLength_->add(buf, velox::common::Ranges::of(0, size), nullptr); }); // When all the Keys are in Dictionary, inDictionaryStream is omitted. 
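The add(buffer, ranges, nulls) calls touched throughout these writer hunks share one calling convention: a raw value buffer, a velox::common::Ranges built with Ranges::of(begin, end) that, judging by the element counts in the encoder tests above (e.g. of(0, 1024) over 1024 generated values), denotes the half-open index range [begin, end), plus an optional null bitmap. The toy, self-contained sketch below restates that convention with made-up names and its own bit layout; it is not the Velox implementation.

#include <cstddef>
#include <cstdint>
#include <cstdio>

namespace sketch {

// In this toy layout a set bit means "null"; the real bit convention is not
// restated here.
bool isNullAt(const uint64_t* nulls, size_t pos) {
  return nulls != nullptr && ((nulls[pos / 64] >> (pos % 64)) & 1);
}

// Consumes values in the half-open range [begin, end), skipping null
// positions, and returns how many values were "added".
size_t addValues(
    const int64_t* data,
    size_t begin,
    size_t end,
    const uint64_t* nulls) {
  size_t added = 0;
  for (size_t pos = begin; pos < end; ++pos) {
    if (isNullAt(nulls, pos)) {
      continue;
    }
    std::printf("value %lld at %zu\n", static_cast<long long>(data[pos]), pos);
    ++added;
  }
  return added;
}

} // namespace sketch

int main() {
  int64_t data[] = {10, 20, 30, 40};
  uint64_t nulls = 0b0100; // position 2 is null in this toy layout
  return sketch::addValues(data, 0, 4, &nulls) == 3 ? 0 : 1;
}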
@@ -1363,7 +1363,7 @@ void StringColumnWriter::populateDictionaryEncodingStreams() { getMemoryPool(MemoryUsageCategory::GENERAL), 64 * 1024, [&](auto buf, auto size) { - inDictionary_->add(buf, common::Ranges::of(0, size), nullptr); + inDictionary_->add(buf, velox::common::Ranges::of(0, size), nullptr); }); auto errorGuard = folly::makeGuard([&inDictWriter]() { inDictWriter.abort(); }); @@ -1393,7 +1393,7 @@ void StringColumnWriter::populateDictionaryEncodingStreams() { 64 * 1024, [&](auto buf, auto size) { strideDictionaryDataLength_->add( - buf, common::Ranges::of(0, size), nullptr); + buf, velox::common::Ranges::of(0, size), nullptr); }); auto errorGuard = folly::makeGuard( [&strideLengthWriter]() { strideLengthWriter.abort(); }); @@ -1416,7 +1416,7 @@ void StringColumnWriter::populateDictionaryEncodingStreams() { end, [&](auto index) { return lookupTable[rows_[index]]; }, [&](auto buf, auto size) { - data_->add(buf, common::Ranges::of(0, size), nullptr); + data_->add(buf, velox::common::Ranges::of(0, size), nullptr); }); }; @@ -1442,7 +1442,7 @@ void StringColumnWriter::convertToDirectEncoding() { getMemoryPool(MemoryUsageCategory::GENERAL), 64 * 1024, [&](auto buf, auto size) { - dataDirectLength_->add(buf, common::Ranges::of(0, size), nullptr); + dataDirectLength_->add(buf, velox::common::Ranges::of(0, size), nullptr); }); auto errorGuard = folly::makeGuard([&lengthWriter]() { lengthWriter.abort(); }); @@ -1478,7 +1478,7 @@ class FloatColumnWriter : public BaseColumnWriter { reset(); } - uint64_t write(const VectorPtr& slice, const common::Ranges& ranges) override; + uint64_t write(const VectorPtr& slice, const velox::common::Ranges& ranges) override; void flush( std::function encodingFactory, @@ -1499,7 +1499,7 @@ class FloatColumnWriter : public BaseColumnWriter { template uint64_t FloatColumnWriter::write( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { static_assert(folly::kIsLittleEndian, "not supported"); auto& statsBuilder = dynamic_cast(*indexStatsBuilder_); @@ -1609,7 +1609,7 @@ class BinaryColumnWriter : public BaseColumnWriter { reset(); } - uint64_t write(const VectorPtr& slice, const common::Ranges& ranges) override; + uint64_t write(const VectorPtr& slice, const velox::common::Ranges& ranges) override; void flush( std::function encodingFactory, @@ -1632,7 +1632,7 @@ class BinaryColumnWriter : public BaseColumnWriter { uint64_t BinaryColumnWriter::write( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { auto& statsBuilder = dynamic_cast(*indexStatsBuilder_); uint64_t rawSize = 0; @@ -1702,7 +1702,7 @@ uint64_t BinaryColumnWriter::write( if (lengths.size() > 0) { lengths_->add( - lengths.data(), common::Ranges::of(0, lengths.size()), nullptr); + lengths.data(), velox::common::Ranges::of(0, lengths.size()), nullptr); } if (nullCount > 0) { statsBuilder.setHasNull(); @@ -1723,7 +1723,7 @@ class StructColumnWriter : public BaseColumnWriter { reset(); } - uint64_t write(const VectorPtr& slice, const common::Ranges& ranges) override; + uint64_t write(const VectorPtr& slice, const velox::common::Ranges& ranges) override; void flush( std::function encodingFactory, @@ -1737,13 +1737,13 @@ class StructColumnWriter : public BaseColumnWriter { private: uint64_t writeChildrenAndStats( const RowVector* rowSlice, - const common::Ranges& ranges, + const velox::common::Ranges& ranges, uint64_t nullCount); }; uint64_t StructColumnWriter::writeChildrenAndStats( const RowVector* rowSlice, - 
const common::Ranges& ranges, + const velox::common::Ranges& ranges, uint64_t nullCount) { uint64_t rawSize = 0; if (ranges.size() > 0) { @@ -1762,13 +1762,13 @@ uint64_t StructColumnWriter::writeChildrenAndStats( uint64_t StructColumnWriter::write( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { // Special case for writing the root. Root writer accepts rows, so all are // not null. if (isRoot()) { - common::Ranges childRanges; + velox::common::Ranges childRanges; const RowVector* rowSlice; - const common::Ranges* childRangesPtr; + const velox::common::Ranges* childRangesPtr; if (slice->encoding() != VectorEncoding::Simple::ROW) { auto localDecoded = decode(slice, ranges); auto& decodedVector = localDecoded.get(); @@ -1792,9 +1792,9 @@ uint64_t StructColumnWriter::write( // General case for writing row (struct) uint64_t nullCount = 0; - common::Ranges childRanges; + velox::common::Ranges childRanges; const RowVector* rowSlice; - const common::Ranges* childRangesPtr; + const velox::common::Ranges* childRangesPtr; if (slice->encoding() != VectorEncoding::Simple::ROW) { auto localDecoded = decode(slice, ranges); auto& decodedVector = localDecoded.get(); @@ -1852,7 +1852,7 @@ class ListColumnWriter : public BaseColumnWriter { reset(); } - uint64_t write(const VectorPtr& slice, const common::Ranges& ranges) override; + uint64_t write(const VectorPtr& slice, const velox::common::Ranges& ranges) override; void flush( std::function encodingFactory, @@ -1873,11 +1873,11 @@ class ListColumnWriter : public BaseColumnWriter { uint64_t ListColumnWriter::write( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { DataBuffer nonNullLengths{ getMemoryPool(MemoryUsageCategory::GENERAL)}; nonNullLengths.reserve(ranges.size()); - common::Ranges childRanges; + velox::common::Ranges childRanges; uint64_t nullCount = 0; const ArrayVector* arraySlice; @@ -1945,7 +1945,7 @@ uint64_t ListColumnWriter::write( if (nonNullLengths.size()) { lengths_->add( nonNullLengths.data(), - common::Ranges::of(0, nonNullLengths.size()), + velox::common::Ranges::of(0, nonNullLengths.size()), nullptr); } @@ -1979,7 +1979,7 @@ class MapColumnWriter : public BaseColumnWriter { reset(); } - uint64_t write(const VectorPtr& slice, const common::Ranges& ranges) override; + uint64_t write(const VectorPtr& slice, const velox::common::Ranges& ranges) override; void flush( std::function encodingFactory, @@ -2001,11 +2001,11 @@ class MapColumnWriter : public BaseColumnWriter { uint64_t MapColumnWriter::write( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { DataBuffer nonNullLengths{ getMemoryPool(MemoryUsageCategory::GENERAL)}; nonNullLengths.reserve(ranges.size()); - common::Ranges childRanges; + velox::common::Ranges childRanges; uint64_t nullCount = 0; const MapVector* mapSlice; @@ -2073,7 +2073,7 @@ uint64_t MapColumnWriter::write( if (nonNullLengths.size()) { lengths_->add( nonNullLengths.data(), - common::Ranges::of(0, nonNullLengths.size()), + velox::common::Ranges::of(0, nonNullLengths.size()), nullptr); } diff --git a/velox/dwio/dwrf/writer/ColumnWriter.h b/velox/dwio/dwrf/writer/ColumnWriter.h index 98c5691babb9..bbb60485fc0e 100644 --- a/velox/dwio/dwrf/writer/ColumnWriter.h +++ b/velox/dwio/dwrf/writer/ColumnWriter.h @@ -38,7 +38,7 @@ class ColumnWriter { virtual uint64_t write( const VectorPtr& slice, - const common::Ranges& ranges) = 0; + const velox::common::Ranges& ranges) = 0; 
virtual void createIndexEntry() = 0; @@ -175,7 +175,7 @@ class BaseColumnWriter : public ColumnWriter { fileStatsBuilder_ = StatisticsBuilder::create(*type.type(), options); } - uint64_t writeNulls(const VectorPtr& slice, const common::Ranges& ranges) { + uint64_t writeNulls(const VectorPtr& slice, const velox::common::Ranges& ranges) { if (FOLLY_UNLIKELY(ranges.size() == 0)) { return 0; } @@ -191,7 +191,7 @@ class BaseColumnWriter : public ColumnWriter { /// Function used only for the cases dealing with Dictionary vectors uint64_t writeNulls( const DecodedVector& decoded, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { if (FOLLY_UNLIKELY(ranges.size() == 0)) { return 0; } @@ -256,7 +256,7 @@ class BaseColumnWriter : public ColumnWriter { WriterContext::LocalDecodedVector decode( const VectorPtr& slice, - const common::Ranges& ranges); + const velox::common::Ranges& ranges); const dwio::common::TypeWithId& type_; std::vector> children_; diff --git a/velox/dwio/dwrf/writer/FlatMapColumnWriter.cpp b/velox/dwio/dwrf/writer/FlatMapColumnWriter.cpp index 09297b38be61..fdb49e4f9f0b 100644 --- a/velox/dwio/dwrf/writer/FlatMapColumnWriter.cpp +++ b/velox/dwio/dwrf/writer/FlatMapColumnWriter.cpp @@ -325,7 +325,7 @@ class Decoded { template uint64_t -iterateMaps(const common::Ranges& ranges, const Map& map, const MapOp& mapOp) { +iterateMaps(const velox::common::Ranges& ranges, const Map& map, const MapOp& mapOp) { uint64_t nullCount = 0; if (map.hasNulls()) { for (auto& index : ranges) { @@ -348,7 +348,7 @@ iterateMaps(const common::Ranges& ranges, const Map& map, const MapOp& mapOp) { template uint64_t FlatMapColumnWriter::write( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { switch (slice->typeKind()) { case TypeKind::MAP: return writeMap(slice, ranges); @@ -367,13 +367,13 @@ uint64_t FlatMapColumnWriter::write( template uint64_t FlatMapColumnWriter::writeMap( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { // Define variables captured and used by below lambdas. const vector_size_t* offsets; const vector_size_t* lengths; uint64_t rawSize = 0; uint64_t mapCount = 0; - common::Ranges keyRanges; + velox::common::Ranges keyRanges; // Lambda that iterates keys of a map and records the offsets to write to // particular value node. 
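The FlatMapColumnWriter hunks that follow thread velox::common::Ranges through writeMap and writeRow, and the getNonNullRanges helper in the next hunk reduces an input range to the rows that are not null before handing them to child writers. A minimal standalone sketch of that filtering step, with std::vector<size_t> standing in for Ranges and a predicate standing in for Row::isNullAt (all names illustrative):

#include <cstddef>
#include <functional>
#include <vector>

// Keeps only the positions for which isNullAt() reports non-null.
std::vector<size_t> nonNullPositions(
    const std::vector<size_t>& positions,
    const std::function<bool(size_t)>& isNullAt) {
  std::vector<size_t> kept;
  kept.reserve(positions.size());
  for (size_t index : positions) {
    if (!isNullAt(index)) {
      kept.push_back(index);
    }
  }
  return kept;
}

int main() {
  std::vector<size_t> rows{0, 1, 2, 3, 4};
  // Pretend odd rows are null; positions 0, 2, 4 survive.
  auto kept = nonNullPositions(rows, [](size_t i) { return i % 2 == 1; });
  return kept.size() == 3 ? 0 : 1;
}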
@@ -472,8 +472,8 @@ uint64_t FlatMapColumnWriter::writeMap( } template -common::Ranges getNonNullRanges(const common::Ranges& ranges, const Row& row) { - common::Ranges nonNullRanges; +common::Ranges getNonNullRanges(const velox::common::Ranges& ranges, const Row& row) { + velox::common::Ranges nonNullRanges; if (row.hasNulls()) { for (auto& index : ranges) { if (!row.isNullAt(index)) { @@ -489,9 +489,9 @@ common::Ranges getNonNullRanges(const common::Ranges& ranges, const Row& row) { template uint64_t FlatMapColumnWriter::writeRow( const VectorPtr& slice, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { uint64_t rawSize = 0; - common::Ranges nonNullRanges; + velox::common::Ranges nonNullRanges; const RowVector* rowSlice = slice->as(); if (rowSlice) { diff --git a/velox/dwio/dwrf/writer/FlatMapColumnWriter.h b/velox/dwio/dwrf/writer/FlatMapColumnWriter.h index 6e50a0e435ac..a793dddd2e53 100644 --- a/velox/dwio/dwrf/writer/FlatMapColumnWriter.h +++ b/velox/dwio/dwrf/writer/FlatMapColumnWriter.h @@ -136,7 +136,7 @@ class ValueWriter { uint64_t writeBuffers(const VectorPtr& values, uint32_t mapCount) { if (mapCount) { inMap_->add( - inMapBuffer_.data(), common::Ranges::of(0, mapCount), nullptr); + inMapBuffer_.data(), velox::common::Ranges::of(0, mapCount), nullptr); } if (values) { @@ -148,12 +148,12 @@ class ValueWriter { // used for struct encoding writer uint64_t writeBuffers( const VectorPtr& values, - const common::Ranges& nonNullRanges, + const velox::common::Ranges& nonNullRanges, const BufferPtr& inMapBuffer /* all 1 */) { if (nonNullRanges.size()) { inMap_->add( inMapBuffer->as(), - common::Ranges::of(0, nonNullRanges.size()), + velox::common::Ranges::of(0, nonNullRanges.size()), nullptr); } @@ -170,7 +170,7 @@ class ValueWriter { inMapBuffer_.reserve(count); std::memset(inMapBuffer_.data(), 0, count); - inMap_->add(inMapBuffer_.data(), common::Ranges::of(0, count), nullptr); + inMap_->add(inMapBuffer_.data(), velox::common::Ranges::of(0, count), nullptr); } uint32_t getSequence() const { @@ -214,7 +214,7 @@ class ValueWriter { std::unique_ptr inMap_; std::unique_ptr columnWriter_; dwio::common::DataBuffer inMapBuffer_; - common::Ranges ranges_; + velox::common::Ranges ranges_; const bool collectMapStats_; }; @@ -263,7 +263,7 @@ class FlatMapColumnWriter : public BaseColumnWriter { const dwio::common::TypeWithId& type, const uint32_t sequence); - uint64_t write(const VectorPtr& slice, const common::Ranges& ranges) override; + uint64_t write(const VectorPtr& slice, const velox::common::Ranges& ranges) override; void flush( std::function encodingFactory, @@ -284,8 +284,8 @@ class FlatMapColumnWriter : public BaseColumnWriter { ValueWriter& getValueWriter(KeyType key, uint32_t inMapSize); // write() calls writeMap() or writeRow() depending on input type - uint64_t writeMap(const VectorPtr& slice, const common::Ranges& ranges); - uint64_t writeRow(const VectorPtr& slice, const common::Ranges& ranges); + uint64_t writeMap(const VectorPtr& slice, const velox::common::Ranges& ranges); + uint64_t writeRow(const VectorPtr& slice, const velox::common::Ranges& ranges); void clearNodes(); diff --git a/velox/dwio/dwrf/writer/IntegerDictionaryEncoder.h b/velox/dwio/dwrf/writer/IntegerDictionaryEncoder.h index 3ce91eb82070..07d9cd8a326f 100644 --- a/velox/dwio/dwrf/writer/IntegerDictionaryEncoder.h +++ b/velox/dwio/dwrf/writer/IntegerDictionaryEncoder.h @@ -243,7 +243,7 @@ class IntegerDictionaryEncoder : public AbstractIntegerDictionaryEncoder { [dictDataWriter = 
dictDataWriter_.get()]( Integer* const buf, const uint32_t size) mutable { if (dictDataWriter) { - dictDataWriter->add(buf, common::Ranges::of(0, size), nullptr); + dictDataWriter->add(buf, velox::common::Ranges::of(0, size), nullptr); dictDataWriter->flush(); } }); diff --git a/velox/dwio/dwrf/writer/StatisticsBuilderUtils.cpp b/velox/dwio/dwrf/writer/StatisticsBuilderUtils.cpp index 3babe373db2a..b072e93dcd59 100644 --- a/velox/dwio/dwrf/writer/StatisticsBuilderUtils.cpp +++ b/velox/dwio/dwrf/writer/StatisticsBuilderUtils.cpp @@ -21,7 +21,7 @@ namespace facebook::velox::dwrf { void StatisticsBuilderUtils::addValues( StatisticsBuilder& builder, const VectorPtr& vector, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { auto nulls = vector->rawNulls(); if (vector->mayHaveNulls()) { for (auto& pos : ranges) { @@ -39,7 +39,7 @@ void StatisticsBuilderUtils::addValues( void StatisticsBuilderUtils::addValues( BooleanStatisticsBuilder& builder, const VectorPtr& vector, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { auto nulls = vector->rawNulls(); auto vals = vector->as>()->asRange(); if (vector->mayHaveNulls()) { @@ -60,7 +60,7 @@ void StatisticsBuilderUtils::addValues( void StatisticsBuilderUtils::addValues( BooleanStatisticsBuilder& builder, const DecodedVector& vector, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { if (vector.mayHaveNulls()) { for (auto& pos : ranges) { if (vector.isNullAt(pos)) { @@ -79,7 +79,7 @@ void StatisticsBuilderUtils::addValues( void StatisticsBuilderUtils::addValues( StringStatisticsBuilder& builder, const VectorPtr& vector, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { auto nulls = vector->rawNulls(); auto data = vector->asFlatVector()->rawValues(); if (vector->mayHaveNulls()) { @@ -100,7 +100,7 @@ void StatisticsBuilderUtils::addValues( void StatisticsBuilderUtils::addValues( BinaryStatisticsBuilder& builder, const VectorPtr& vector, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { auto nulls = vector->rawNulls(); auto data = vector->asFlatVector()->rawValues(); if (vector->mayHaveNulls()) { diff --git a/velox/dwio/dwrf/writer/StatisticsBuilderUtils.h b/velox/dwio/dwrf/writer/StatisticsBuilderUtils.h index 6c0a2848b2f9..eda583082780 100644 --- a/velox/dwio/dwrf/writer/StatisticsBuilderUtils.h +++ b/velox/dwio/dwrf/writer/StatisticsBuilderUtils.h @@ -28,52 +28,52 @@ class StatisticsBuilderUtils { static void addValues( StatisticsBuilder& builder, const VectorPtr& vector, - const common::Ranges& ranges); + const velox::common::Ranges& ranges); static void addValues( BooleanStatisticsBuilder& builder, const VectorPtr& vector, - const common::Ranges& ranges); + const velox::common::Ranges& ranges); static void addValues( BooleanStatisticsBuilder& builder, const DecodedVector& vector, - const common::Ranges& ranges); + const velox::common::Ranges& ranges); template static void addValues( IntegerStatisticsBuilder& builder, const VectorPtr& vector, - const common::Ranges& ranges); + const velox::common::Ranges& ranges); template static void addValues( IntegerStatisticsBuilder& builder, const DecodedVector& vector, - const common::Ranges& ranges); + const velox::common::Ranges& ranges); template static void addValues( DoubleStatisticsBuilder& builder, const VectorPtr& vector, - const common::Ranges& ranges); + const velox::common::Ranges& ranges); static void addValues( StringStatisticsBuilder& builder, const VectorPtr& vector, - const 
common::Ranges& ranges); + const velox::common::Ranges& ranges); static void addValues( BinaryStatisticsBuilder& builder, const VectorPtr& vector, - const common::Ranges& ranges); + const velox::common::Ranges& ranges); }; template void StatisticsBuilderUtils::addValues( IntegerStatisticsBuilder& builder, const VectorPtr& vector, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { auto nulls = vector->rawNulls(); auto vals = vector->asFlatVector()->rawValues(); if (vector->mayHaveNulls()) { @@ -95,7 +95,7 @@ template void StatisticsBuilderUtils::addValues( IntegerStatisticsBuilder& builder, const DecodedVector& vector, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { if (vector.mayHaveNulls()) { for (auto& pos : ranges) { if (vector.isNullAt(pos)) { @@ -115,7 +115,7 @@ template void StatisticsBuilderUtils::addValues( DoubleStatisticsBuilder& builder, const VectorPtr& vector, - const common::Ranges& ranges) { + const velox::common::Ranges& ranges) { auto nulls = vector->rawNulls(); auto vals = vector->asFlatVector()->rawValues(); if (vector->mayHaveNulls()) { diff --git a/velox/dwio/dwrf/writer/Writer.cpp b/velox/dwio/dwrf/writer/Writer.cpp index b5af93a2cc1b..05da5d953ab5 100644 --- a/velox/dwio/dwrf/writer/Writer.cpp +++ b/velox/dwio/dwrf/writer/Writer.cpp @@ -306,7 +306,7 @@ void Writer::write(const VectorPtr& input) { } const auto rawSize = writer_->write( - input, common::Ranges::of(rowOffset, rowOffset + numRowsToWrite)); + input, velox::common::Ranges::of(rowOffset, rowOffset + numRowsToWrite)); rowOffset += numRowsToWrite; context.incRawSize(rawSize); diff --git a/velox/dwio/dwrf/writer/Writer.h b/velox/dwio/dwrf/writer/Writer.h index 3864b57790d1..8b7c7e374c3e 100644 --- a/velox/dwio/dwrf/writer/Writer.h +++ b/velox/dwio/dwrf/writer/Writer.h @@ -210,7 +210,7 @@ class Writer : public dwio::common::Writer { } const std::shared_ptr schema_; - const common::SpillConfig* const spillConfig_; + const velox::common::SpillConfig* const spillConfig_; // If not null, used by memory arbitration to track if this file writer is // under memory reclaimable section or not. 
tsan_atomic* const nonReclaimableSection_{nullptr}; diff --git a/velox/dwio/dwrf/writer/WriterBase.cpp b/velox/dwio/dwrf/writer/WriterBase.cpp index 6fd51477d048..5e16cd9b138f 100644 --- a/velox/dwio/dwrf/writer/WriterBase.cpp +++ b/velox/dwio/dwrf/writer/WriterBase.cpp @@ -64,13 +64,13 @@ void WriterBase::writeFooter(const Type& type) { ps.set_compression( static_cast(context_->compression())); if (context_->compression() != - common::CompressionKind::CompressionKind_NONE) { + velox::common::CompressionKind::CompressionKind_NONE) { ps.set_compressionblocksize(context_->compressionBlockSize()); } ps.set_cachemode( static_cast(writerSink_->getCacheMode())); ps.set_cachesize(cacheSize); - writeProto(ps, common::CompressionKind::CompressionKind_NONE); + writeProto(ps, velox::common::CompressionKind::CompressionKind_NONE); auto psLength = writerSink_->size() - pos; DWIO_ENSURE_LE(psLength, 0xff, "PostScript is too large: ", psLength); auto psLen = static_cast(psLength); diff --git a/velox/dwio/dwrf/writer/WriterBase.h b/velox/dwio/dwrf/writer/WriterBase.h index e79b9293db9b..cc745368c69f 100644 --- a/velox/dwio/dwrf/writer/WriterBase.h +++ b/velox/dwio/dwrf/writer/WriterBase.h @@ -103,7 +103,7 @@ class WriterBase { } template - void writeProto(const T& t, common::CompressionKind kind) { + void writeProto(const T& t, velox::common::CompressionKind kind) { auto holder = context_->newDataBufferHolder(); auto stream = context_->newStream(kind, *holder); diff --git a/velox/dwio/dwrf/writer/WriterContext.cpp b/velox/dwio/dwrf/writer/WriterContext.cpp index ce02aacf2921..6b63497ddd51 100644 --- a/velox/dwio/dwrf/writer/WriterContext.cpp +++ b/velox/dwio/dwrf/writer/WriterContext.cpp @@ -69,7 +69,7 @@ WriterContext::WriterContext( } validateConfigs(); VLOG(2) << fmt::format( - "Compression config: {}", common::compressionKindToString(compression_)); + "Compression config: {}", velox::common::compressionKindToString(compression_)); } WriterContext::~WriterContext() { @@ -96,7 +96,7 @@ void WriterContext::validateConfigs() const { void WriterContext::initBuffer() { VELOX_CHECK_NULL(compressionBuffer_); - if (compression_ != common::CompressionKind_NONE) { + if (compression_ != velox::common::CompressionKind_NONE) { compressionBuffer_ = std::make_unique>( *generalPool_, compressionBlockSize_ + PAGE_HEADER_SIZE); } diff --git a/velox/dwio/dwrf/writer/WriterContext.h b/velox/dwio/dwrf/writer/WriterContext.h index 9ba444d53175..532ae0196b9e 100644 --- a/velox/dwio/dwrf/writer/WriterContext.h +++ b/velox/dwio/dwrf/writer/WriterContext.h @@ -103,7 +103,7 @@ class WriterContext : public CompressionBufferPool { } std::unique_ptr newStream( - common::CompressionKind kind, + velox::common::CompressionKind kind, DataBufferHolder& holder, const dwio::common::encryption::Encrypter* encrypter = nullptr) { return createCompressor(kind, *this, holder, *config_, encrypter); @@ -151,7 +151,7 @@ class WriterContext : public CompressionBufferPool { } bool isStreamPaged(uint32_t nodeId) const { - return (compression_ != common::CompressionKind::CompressionKind_NONE) || + return (compression_ != velox::common::CompressionKind::CompressionKind_NONE) || handler_->isEncrypted(nodeId); } @@ -386,7 +386,7 @@ class WriterContext : public CompressionBufferPool { return indexStride_; } - common::CompressionKind compression() const { + velox::common::CompressionKind compression() const { return compression_; } @@ -629,7 +629,7 @@ class WriterContext : public CompressionBufferPool { // config const bool indexEnabled_; const uint32_t 
indexStride_; - const common::CompressionKind compression_; + const velox::common::CompressionKind compression_; const uint64_t compressionBlockSize_; const bool shareFlatMapDictionaries_; const uint64_t stripeSizeFlushThreshold_; diff --git a/velox/dwio/orc/test/ReaderFilterTest.cpp b/velox/dwio/orc/test/ReaderFilterTest.cpp index 329286f00791..d3fbffc1dbfc 100644 --- a/velox/dwio/orc/test/ReaderFilterTest.cpp +++ b/velox/dwio/orc/test/ReaderFilterTest.cpp @@ -92,23 +92,23 @@ INSTANTIATE_TEST_SUITE_P( // "e": 1.1, float OrcReaderFilterParam{ "e", - std::make_shared>(1.0, false, false, 2.0, false, false, false), 1}, OrcReaderFilterParam{ "e", - std::make_shared>(2.0, false, false, 3.0, false, false, false), 0}, // "f": 1.12, double OrcReaderFilterParam{ "f", - std::make_shared>(1.0, false, false, 2.0, false, false, false), 1}, OrcReaderFilterParam{ "f", - std::make_shared>(2.0, false, false, 3.0, false, false, false), 0}, // "g": "velox", varchar @@ -212,7 +212,7 @@ TEST_P(OrcReaderFilterTestP, tests) { // auto rowType = DataSetBuilder::makeRowType(schema, true); // auto filterGenerator = std::make_unique(rowType); - auto scanSpec = std::make_shared(""); + auto scanSpec = std::make_shared(""); scanSpec->addAllChildFields(*schema); std::string fileName = "orc_all_type.orc"; diff --git a/velox/dwio/orc/test/ReaderTest.cpp b/velox/dwio/orc/test/ReaderTest.cpp index cad607b5761d..d323f989cc54 100644 --- a/velox/dwio/orc/test/ReaderTest.cpp +++ b/velox/dwio/orc/test/ReaderTest.cpp @@ -254,7 +254,7 @@ TEST_F(OrcReaderTest, testOrcRlev2) { const std::string dateOrc(getExamplesFilePath("rlev2.orc")); auto schema = ROW({"id", "price", "name"}, {BIGINT(), DECIMAL(7, 2), VARCHAR()}); - auto spec = std::make_shared(""); + auto spec = std::make_shared(""); spec->addAllChildFields(*schema); dwio::common::ReaderOptions readerOpts{pool()}; @@ -297,7 +297,7 @@ class OrcFileDescription { uint64_t rowCount; uint64_t contentLength; uint64_t stripeCount; - common::CompressionKind compression; + velox::common::CompressionKind compression; size_t compressionSize; uint64_t rowIndexStride; std::map userMeta; @@ -311,7 +311,7 @@ class OrcFileDescription { uint64_t _rowCount, uint64_t _contentLength, uint64_t _stripeCount, - common::CompressionKind _compression, + velox::common::CompressionKind _compression, size_t _compressionSize, uint64_t _rowIndexStride, const std::map& _meta) @@ -406,7 +406,7 @@ TEST_P(OrcReaderTestP, DwrfRowReader_ReadAllColumnTypes_ExpectedRowDataRead) { std::string schemaString = GetParam().typeString; auto type = HiveTypeParser().parse(schemaString); auto schema = std::dynamic_pointer_cast(type); - auto scanSpec = std::make_shared(""); + auto scanSpec = std::make_shared(""); scanSpec->addAllChildFields(*schema); const std::string dateOrc(getFilename()); @@ -448,7 +448,7 @@ INSTANTIATE_TEST_SUITE_P( 21000, 428406, 5, - common::CompressionKind::CompressionKind_NONE, + velox::common::CompressionKind::CompressionKind_NONE, 262144, 1000, std::map()), @@ -461,7 +461,7 @@ INSTANTIATE_TEST_SUITE_P( 50000, 214643, 10, - common::CompressionKind::CompressionKind_SNAPPY, + velox::common::CompressionKind::CompressionKind_SNAPPY, 1000, 0, std::map())), diff --git a/velox/dwio/parquet/reader/BooleanColumnReader.h b/velox/dwio/parquet/reader/BooleanColumnReader.h index 458cc972bbbb..bb20fd755e64 100644 --- a/velox/dwio/parquet/reader/BooleanColumnReader.h +++ b/velox/dwio/parquet/reader/BooleanColumnReader.h @@ -28,7 +28,7 @@ class BooleanColumnReader : public dwio::common::SelectiveByteRleColumnReader { 
const TypePtr& requestedType, std::shared_ptr fileType, ParquetParams& params, - common::ScanSpec& scanSpec) + velox::common::ScanSpec& scanSpec) : SelectiveByteRleColumnReader( requestedType, std::move(fileType), diff --git a/velox/dwio/parquet/reader/FloatingPointColumnReader.h b/velox/dwio/parquet/reader/FloatingPointColumnReader.h index cac475c0ee94..eedce755cc87 100644 --- a/velox/dwio/parquet/reader/FloatingPointColumnReader.h +++ b/velox/dwio/parquet/reader/FloatingPointColumnReader.h @@ -34,7 +34,7 @@ class FloatingPointColumnReader const TypePtr& requestedType, std::shared_ptr fileType, ParquetParams& params, - common::ScanSpec& scanSpec); + velox::common::ScanSpec& scanSpec); void seekToRowGroup(int64_t index) override { base::seekToRowGroup(index); @@ -61,7 +61,7 @@ FloatingPointColumnReader::FloatingPointColumnReader( const TypePtr& requestedType, std::shared_ptr fileType, ParquetParams& params, - common::ScanSpec& scanSpec) + velox::common::ScanSpec& scanSpec) : dwio::common::SelectiveFloatingPointColumnReader( requestedType, std::move(fileType), diff --git a/velox/dwio/parquet/reader/IntegerColumnReader.h b/velox/dwio/parquet/reader/IntegerColumnReader.h index 8c2aa2b4df16..6af6b8ee483f 100644 --- a/velox/dwio/parquet/reader/IntegerColumnReader.h +++ b/velox/dwio/parquet/reader/IntegerColumnReader.h @@ -26,7 +26,7 @@ class IntegerColumnReader : public dwio::common::SelectiveIntegerColumnReader { const TypePtr& requestedType, std::shared_ptr fileType, ParquetParams& params, - common::ScanSpec& scanSpec) + velox::common::ScanSpec& scanSpec) : SelectiveIntegerColumnReader( requestedType, params, diff --git a/velox/dwio/parquet/reader/Metadata.cpp b/velox/dwio/parquet/reader/Metadata.cpp index 8920b0ea400f..897b86dff9b2 100644 --- a/velox/dwio/parquet/reader/Metadata.cpp +++ b/velox/dwio/parquet/reader/Metadata.cpp @@ -155,25 +155,25 @@ common::CompressionKind thriftCodecToCompressionKind( thrift::CompressionCodec::type codec) { switch (codec) { case thrift::CompressionCodec::UNCOMPRESSED: - return common::CompressionKind::CompressionKind_NONE; + return velox::common::CompressionKind::CompressionKind_NONE; break; case thrift::CompressionCodec::SNAPPY: - return common::CompressionKind::CompressionKind_SNAPPY; + return velox::common::CompressionKind::CompressionKind_SNAPPY; break; case thrift::CompressionCodec::GZIP: - return common::CompressionKind::CompressionKind_GZIP; + return velox::common::CompressionKind::CompressionKind_GZIP; break; case thrift::CompressionCodec::LZO: - return common::CompressionKind::CompressionKind_LZO; + return velox::common::CompressionKind::CompressionKind_LZO; break; case thrift::CompressionCodec::LZ4: - return common::CompressionKind::CompressionKind_LZ4; + return velox::common::CompressionKind::CompressionKind_LZ4; break; case thrift::CompressionCodec::ZSTD: - return common::CompressionKind::CompressionKind_ZSTD; + return velox::common::CompressionKind::CompressionKind_ZSTD; break; case thrift::CompressionCodec::LZ4_RAW: - return common::CompressionKind::CompressionKind_LZ4; + return velox::common::CompressionKind::CompressionKind_LZ4; default: VELOX_UNSUPPORTED( "Unsupported compression type: " + diff --git a/velox/dwio/parquet/reader/Metadata.h b/velox/dwio/parquet/reader/Metadata.h index a0241828b546..85c186b7ca7c 100644 --- a/velox/dwio/parquet/reader/Metadata.h +++ b/velox/dwio/parquet/reader/Metadata.h @@ -62,7 +62,7 @@ class ColumnChunkMetaDataPtr { int64_t dictionaryPageOffset() const; /// The compression. 
- common::CompressionKind compression() const; + velox::common::CompressionKind compression() const; /// Total byte size of all the compressed (and potentially encrypted) /// column data in this row group. diff --git a/velox/dwio/parquet/reader/PageReader.cpp b/velox/dwio/parquet/reader/PageReader.cpp index 115a77b7bf36..a5d9ec900007 100644 --- a/velox/dwio/parquet/reader/PageReader.cpp +++ b/velox/dwio/parquet/reader/PageReader.cpp @@ -327,7 +327,7 @@ void PageReader::prepareDictionary(const PageHeader& pageHeader) { dictionaryEncoding_ == Encoding::PLAIN_DICTIONARY || dictionaryEncoding_ == Encoding::PLAIN); - if (codec_ != common::CompressionKind::CompressionKind_NONE) { + if (codec_ != velox::common::CompressionKind::CompressionKind_NONE) { pageData_ = readBytes(pageHeader.compressed_page_size, pageBuffer_); pageData_ = decompressData( pageData_, diff --git a/velox/dwio/parquet/reader/PageReader.h b/velox/dwio/parquet/reader/PageReader.h index c377100428a7..8e31c5dee659 100644 --- a/velox/dwio/parquet/reader/PageReader.h +++ b/velox/dwio/parquet/reader/PageReader.h @@ -40,7 +40,7 @@ class PageReader { std::unique_ptr stream, memory::MemoryPool& pool, ParquetTypeWithIdPtr fileType, - common::CompressionKind codec, + velox::common::CompressionKind codec, int64_t chunkSize, const tz::TimeZone* sessionTimezone) : pool_(pool), @@ -60,7 +60,7 @@ class PageReader { PageReader( std::unique_ptr stream, memory::MemoryPool& pool, - common::CompressionKind codec, + velox::common::CompressionKind codec, int64_t chunkSize, const tz::TimeZone* sessionTimezone = nullptr) : pool_(pool), @@ -377,7 +377,7 @@ class PageReader { const int32_t maxDefine_; const bool isTopLevel_; - const common::CompressionKind codec_; + const velox::common::CompressionKind codec_; const int64_t chunkSize_; const char* bufferStart_{nullptr}; const char* bufferEnd_{nullptr}; @@ -516,16 +516,16 @@ class PageReader { }; FOLLY_ALWAYS_INLINE dwio::common::compression::CompressionOptions -getParquetDecompressionOptions(common::CompressionKind kind) { +getParquetDecompressionOptions(velox::common::CompressionKind kind) { dwio::common::compression::CompressionOptions options{}; - if (kind == common::CompressionKind_ZLIB || - kind == common::CompressionKind_GZIP) { + if (kind == velox::common::CompressionKind_ZLIB || + kind == velox::common::CompressionKind_GZIP) { options.format.zlib.windowBits = dwio::common::compression::Compressor::PARQUET_ZLIB_WINDOW_BITS; } else if ( - kind == common::CompressionKind_LZ4 || - kind == common::CompressionKind_LZO) { + kind == velox::common::CompressionKind_LZ4 || + kind == velox::common::CompressionKind_LZO) { options.format.lz4_lzo.isHadoopFrameFormat = true; } return options; @@ -534,7 +534,7 @@ getParquetDecompressionOptions(common::CompressionKind kind) { template void PageReader::readWithVisitor(Visitor& visitor) { constexpr bool hasFilter = - !std::is_same_v; + !std::is_same_v; constexpr bool filterOnly = std::is_same_v; bool mayProduceNulls = !filterOnly && visitor.allowNulls(); diff --git a/velox/dwio/parquet/reader/ParquetColumnReader.cpp b/velox/dwio/parquet/reader/ParquetColumnReader.cpp index 2e6665dc39cd..c7d7d4ed0d9e 100644 --- a/velox/dwio/parquet/reader/ParquetColumnReader.cpp +++ b/velox/dwio/parquet/reader/ParquetColumnReader.cpp @@ -37,7 +37,7 @@ std::unique_ptr ParquetColumnReader::build( const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, memory::MemoryPool& pool, bool 
useColumnNames) { auto colName = scanSpec.fieldName(); diff --git a/velox/dwio/parquet/reader/ParquetColumnReader.h b/velox/dwio/parquet/reader/ParquetColumnReader.h index 363e0c0b6768..0f1e6e00b590 100644 --- a/velox/dwio/parquet/reader/ParquetColumnReader.h +++ b/velox/dwio/parquet/reader/ParquetColumnReader.h @@ -45,7 +45,7 @@ class ParquetColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, memory::MemoryPool& pool, bool useColumnNames); }; diff --git a/velox/dwio/parquet/reader/ParquetData.cpp b/velox/dwio/parquet/reader/ParquetData.cpp index 29a593da414c..ca62e9438e25 100644 --- a/velox/dwio/parquet/reader/ParquetData.cpp +++ b/velox/dwio/parquet/reader/ParquetData.cpp @@ -23,13 +23,13 @@ namespace facebook::velox::parquet { std::unique_ptr ParquetParams::toFormatData( const std::shared_ptr& type, - const common::ScanSpec& /*scanSpec*/) { + const velox::common::ScanSpec& /*scanSpec*/) { return std::make_unique( type, metaData_, pool(), sessionTimezone_); } void ParquetData::filterRowGroups( - const common::ScanSpec& scanSpec, + const velox::common::ScanSpec& scanSpec, uint64_t /*rowsPerRowGroup*/, const dwio::common::StatsContext& writerContext, FilterRowGroupsResult& result) { @@ -70,7 +70,7 @@ void ParquetData::filterRowGroups( } } -bool ParquetData::rowGroupMatches(uint32_t rowGroupId, common::Filter* filter) { +bool ParquetData::rowGroupMatches(uint32_t rowGroupId, velox::common::Filter* filter) { auto column = type_->column(); auto type = type_->type(); auto rowGroup = fileMetaDataPtr_.rowGroup(rowGroupId); @@ -107,7 +107,7 @@ void ParquetData::enqueueRowGroup( } uint64_t readSize = - (chunk.compression() == common::CompressionKind::CompressionKind_NONE) + (chunk.compression() == velox::common::CompressionKind::CompressionKind_NONE) ? chunk.totalUncompressedSize() : chunk.totalCompressedSize(); diff --git a/velox/dwio/parquet/reader/ParquetData.h b/velox/dwio/parquet/reader/ParquetData.h index fe8020f57c65..a73b7913712c 100644 --- a/velox/dwio/parquet/reader/ParquetData.h +++ b/velox/dwio/parquet/reader/ParquetData.h @@ -44,7 +44,7 @@ class ParquetParams : public dwio::common::FormatParams { timestampPrecision_(timestampPrecision) {} std::unique_ptr toFormatData( const std::shared_ptr& type, - const common::ScanSpec& scanSpec) override; + const velox::common::ScanSpec& scanSpec) override; TimestampPrecision timestampPrecision() const { return timestampPrecision_; @@ -81,7 +81,7 @@ class ParquetData : public dwio::common::FormatData { dwio::common::PositionProvider seekToRowGroup(int64_t index) override; void filterRowGroups( - const common::ScanSpec& scanSpec, + const velox::common::ScanSpec& scanSpec, uint64_t rowsPerRowGroup, const dwio::common::StatsContext& writerContext, FilterRowGroupsResult&) override; @@ -206,7 +206,7 @@ class ParquetData : public dwio::common::FormatData { private: /// True if 'filter' may have hits for the column of 'this' according to the /// stats in 'rowGroup'. 
- bool rowGroupMatches(uint32_t rowGroupId, common::Filter* filter); + bool rowGroupMatches(uint32_t rowGroupId, velox::common::Filter* filter); protected: memory::MemoryPool& pool_; diff --git a/velox/dwio/parquet/reader/RepeatedColumnReader.cpp b/velox/dwio/parquet/reader/RepeatedColumnReader.cpp index 1c7df63610c0..d71541ea17b1 100644 --- a/velox/dwio/parquet/reader/RepeatedColumnReader.cpp +++ b/velox/dwio/parquet/reader/RepeatedColumnReader.cpp @@ -116,7 +116,7 @@ MapColumnReader::MapColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, memory::MemoryPool& pool, bool useColumnNames) : dwio::common::SelectiveMapColumnReader( @@ -237,7 +237,7 @@ ListColumnReader::ListColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, memory::MemoryPool& pool, bool useColumnNames) : dwio::common::SelectiveListColumnReader( diff --git a/velox/dwio/parquet/reader/RepeatedColumnReader.h b/velox/dwio/parquet/reader/RepeatedColumnReader.h index c731f021c446..8cb5fd5dff30 100644 --- a/velox/dwio/parquet/reader/RepeatedColumnReader.h +++ b/velox/dwio/parquet/reader/RepeatedColumnReader.h @@ -59,7 +59,7 @@ class MapColumnReader : public dwio::common::SelectiveMapColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, memory::MemoryPool& pool, bool useColumnNames); @@ -117,7 +117,7 @@ class ListColumnReader : public dwio::common::SelectiveListColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, memory::MemoryPool& pool, bool useColumnNames); diff --git a/velox/dwio/parquet/reader/RleBpDataDecoder.h b/velox/dwio/parquet/reader/RleBpDataDecoder.h index 29cf69e1a4ce..007ecf4999c7 100644 --- a/velox/dwio/parquet/reader/RleBpDataDecoder.h +++ b/velox/dwio/parquet/reader/RleBpDataDecoder.h @@ -102,7 +102,7 @@ class RleBpDataDecoder : public facebook::velox::parquet::RleBpDecoder { template void fastPath(const uint64_t* nulls, Visitor& visitor) { constexpr bool hasFilter = - !std::is_same_v; + !std::is_same_v; constexpr bool hasHook = !std::is_same_v; auto rows = visitor.rows(); diff --git a/velox/dwio/parquet/reader/StringColumnReader.cpp b/velox/dwio/parquet/reader/StringColumnReader.cpp index ac678b7f0a39..e71ff27542a0 100644 --- a/velox/dwio/parquet/reader/StringColumnReader.cpp +++ b/velox/dwio/parquet/reader/StringColumnReader.cpp @@ -23,7 +23,7 @@ namespace facebook::velox::parquet { StringColumnReader::StringColumnReader( const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec) + velox::common::ScanSpec& scanSpec) : SelectiveColumnReader(fileType->type(), fileType, params, scanSpec) {} uint64_t StringColumnReader::skip(uint64_t numValues) { diff --git a/velox/dwio/parquet/reader/StringColumnReader.h b/velox/dwio/parquet/reader/StringColumnReader.h index 4f60220efa21..0a174afcd6f2 100644 --- a/velox/dwio/parquet/reader/StringColumnReader.h +++ b/velox/dwio/parquet/reader/StringColumnReader.h @@ -27,7 +27,7 @@ class StringColumnReader : public dwio::common::SelectiveColumnReader { StringColumnReader( const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec); + velox::common::ScanSpec& scanSpec); bool 
hasBulkPath() const override { // Non-dictionary encodings do not have fast path. diff --git a/velox/dwio/parquet/reader/StructColumnReader.cpp b/velox/dwio/parquet/reader/StructColumnReader.cpp index f8242fd2d1ed..9fddc957f0d7 100644 --- a/velox/dwio/parquet/reader/StructColumnReader.cpp +++ b/velox/dwio/parquet/reader/StructColumnReader.cpp @@ -30,7 +30,7 @@ StructColumnReader::StructColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, memory::MemoryPool& pool, bool useColumnNames) : SelectiveStructColumnReader( diff --git a/velox/dwio/parquet/reader/StructColumnReader.h b/velox/dwio/parquet/reader/StructColumnReader.h index 0a21652ea806..ba9e7a55bcb1 100644 --- a/velox/dwio/parquet/reader/StructColumnReader.h +++ b/velox/dwio/parquet/reader/StructColumnReader.h @@ -35,7 +35,7 @@ class StructColumnReader : public dwio::common::SelectiveStructColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, ParquetParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, memory::MemoryPool& pool, bool useColumnNames); diff --git a/velox/dwio/parquet/reader/TimestampColumnReader.h b/velox/dwio/parquet/reader/TimestampColumnReader.h index df832d3daeac..b49745abd888 100644 --- a/velox/dwio/parquet/reader/TimestampColumnReader.h +++ b/velox/dwio/parquet/reader/TimestampColumnReader.h @@ -45,7 +45,7 @@ Timestamp toInt96Timestamp(const int128_t& value) { // Range filter for Parquet Timestamp. template -class ParquetTimestampRange final : public common::TimestampRange { +class ParquetTimestampRange final : public velox::common::TimestampRange { public: // Use int128_t for Int96 static_assert(std::is_same_v || std::is_same_v); @@ -89,7 +89,7 @@ class TimestampColumnReader : public IntegerColumnReader { const TypePtr& requestedType, std::shared_ptr fileType, ParquetParams& params, - common::ScanSpec& scanSpec) + velox::common::ScanSpec& scanSpec) : IntegerColumnReader(requestedType, fileType, params, scanSpec), requestedPrecision_(params.timestampPrecision()) { if constexpr (std::is_same_v) { @@ -173,7 +173,7 @@ class TimestampColumnReader : public IntegerColumnReader { rows, dwio::common::ColumnVisitor< int128_t, - common::TimestampRange, + velox::common::TimestampRange, ExtractValues, isDense>(newRange, this, rows, extractValues)); } else { diff --git a/velox/dwio/parquet/tests/ParquetTpchTest.cpp b/velox/dwio/parquet/tests/ParquetTpchTest.cpp index 1ef7a9775b65..ccc603744678 100644 --- a/velox/dwio/parquet/tests/ParquetTpchTest.cpp +++ b/velox/dwio/parquet/tests/ParquetTpchTest.cpp @@ -54,39 +54,39 @@ class ParquetTpchTest : public testing::Test { parquet::registerParquetReaderFactory(); parquet::registerParquetWriterFactory(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); auto hiveConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( kHiveConnectorId, std::make_shared( std::unordered_map())); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); auto tpchConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::tpch::TpchConnectorFactory::kTpchConnectorName) ->newConnector( 
kTpchConnectorId, std::make_shared( std::unordered_map())); - connector::registerConnector(tpchConnector); + connector::common::registerConnector(tpchConnector); saveTpchTablesAsParquet(); tpchBuilder_->initialize(tempDirectory_->getPath()); } static void TearDownTestSuite() { - connector::unregisterConnectorFactory( + connector::common::unregisterConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName); - connector::unregisterConnectorFactory( + connector::common::unregisterConnectorFactory( connector::tpch::TpchConnectorFactory::kTpchConnectorName); - connector::unregisterConnector(kHiveConnectorId); - connector::unregisterConnector(kTpchConnectorId); + connector::common::unregisterConnector(kHiveConnectorId); + connector::common::unregisterConnector(kTpchConnectorId); parquet::unregisterParquetReaderFactory(); parquet::unregisterParquetWriterFactory(); } diff --git a/velox/dwio/parquet/tests/reader/E2EFilterTest.cpp b/velox/dwio/parquet/tests/reader/E2EFilterTest.cpp index f5f608d812aa..434f39e84c15 100644 --- a/velox/dwio/parquet/tests/reader/E2EFilterTest.cpp +++ b/velox/dwio/parquet/tests/reader/E2EFilterTest.cpp @@ -152,10 +152,10 @@ TEST_F(E2EFilterTest, integerDeltaBinaryPack) { TEST_F(E2EFilterTest, compression) { for (const auto compression : {common::CompressionKind_SNAPPY, - common::CompressionKind_ZSTD, - common::CompressionKind_GZIP, - common::CompressionKind_NONE, - common::CompressionKind_LZ4}) { + velox::common::CompressionKind_ZSTD, + velox::common::CompressionKind_GZIP, + velox::common::CompressionKind_NONE, + velox::common::CompressionKind_LZ4}) { if (!facebook::velox::parquet::Writer::isCodecAvailable(compression)) { continue; } diff --git a/velox/dwio/parquet/tests/reader/ParquetPageReaderTest.cpp b/velox/dwio/parquet/tests/reader/ParquetPageReaderTest.cpp index 5145dcfdc8ca..3dbba6389484 100644 --- a/velox/dwio/parquet/tests/reader/ParquetPageReaderTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetPageReaderTest.cpp @@ -34,7 +34,7 @@ TEST_F(ParquetPageReaderTest, smallPage) { auto pageReader = std::make_unique( std::move(inputStream), *leafPool_, - common::CompressionKind::CompressionKind_GZIP, + velox::common::CompressionKind::CompressionKind_GZIP, headerSize); auto header = pageReader->readPageHeader(); EXPECT_EQ(header.type, thrift::PageType::type::DATA_PAGE); @@ -62,7 +62,7 @@ TEST_F(ParquetPageReaderTest, largePage) { auto pageReader = std::make_unique( std::move(inputStream), *leafPool_, - common::CompressionKind::CompressionKind_GZIP, + velox::common::CompressionKind::CompressionKind_GZIP, headerSize); auto header = pageReader->readPageHeader(); @@ -95,7 +95,7 @@ TEST_F(ParquetPageReaderTest, corruptedPageHeader) { auto pageReader = std::make_unique( std::move(inputStream), *leafPool_, - common::CompressionKind::CompressionKind_GZIP, + velox::common::CompressionKind::CompressionKind_GZIP, headerSize); EXPECT_THROW(pageReader->readPageHeader(), VeloxException); diff --git a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp index 5741b349e695..eb138cc68d03 100644 --- a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp @@ -71,7 +71,7 @@ class ParquetTableScanTest : public HiveConnectorTestBase { std::vector&& outputColumnNames, std::unordered_map< std::string, - std::shared_ptr>& assignments, + std::shared_ptr>& assignments, const std::string& sql) { auto rowType = 
getRowType(std::move(outputColumnNames)); auto plan = PlanBuilder() @@ -87,7 +87,7 @@ class ParquetTableScanTest : public HiveConnectorTestBase { const std::string& sql, const std::unordered_map< std::string, - std::shared_ptr>& assignments = + std::shared_ptr>& assignments = {}) { auto rowType = getRowType(std::move(outputColumnNames)); parse::ParseOptions options; @@ -297,7 +297,7 @@ class ParquetTableScanTest : public HiveConnectorTestBase { "SELECT t from tmp where t == TIMESTAMP '2022-12-23 03:56:01'"); } - const std::vector>& splits() + const std::vector>& splits() const { return splits_; } @@ -313,7 +313,7 @@ class ParquetTableScanTest : public HiveConnectorTestBase { } RowTypePtr rowType_; - std::vector> splits_; + std::vector> splits_; TimestampPrecision timestampPrecision_ = TimestampPrecision::kMicroseconds; }; @@ -422,7 +422,7 @@ TEST_F(ParquetTableScanTest, aggregatePushdown) { .tableScan(outputType, {"c1 = 1"}, "") .singleAggregation({"c2"}, {"sum(c3)"}) .planNode(); - std::vector> splits; + std::vector> splits; splits.push_back(makeSplit(getExampleFilePath("gcc_data_diff.parquet"))); auto result = AssertQueryBuilder(plan).splits(splits).copyResults(pool()); ASSERT_EQ(result->size(), 5); @@ -723,7 +723,7 @@ TEST_F(ParquetTableScanTest, rowIndex) { std::unordered_map{{kPath, filePath}}); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; assignments["a"] = std::make_shared( "a", @@ -807,9 +807,9 @@ TEST_F(ParquetTableScanTest, filterNullIcebergPartition) { std::unordered_map>{ {"c1", std::nullopt}}); - std::shared_ptr c0 = makeColumnHandle( + std::shared_ptr c0 = makeColumnHandle( "c0", BIGINT(), BIGINT(), {}, HiveColumnHandle::ColumnType::kRegular); - std::shared_ptr c1 = makeColumnHandle( + std::shared_ptr c1 = makeColumnHandle( "c1", BIGINT(), BIGINT(), @@ -823,7 +823,7 @@ TEST_F(ParquetTableScanTest, filterNullIcebergPartition) { "SELECT c0, c1 FROM tmp WHERE c1 IS NOT NULL", std::unordered_map< std::string, - std::shared_ptr>{ + std::shared_ptr>{ {"c0", c0}, {"c1", c1}}); assertSelectWithFilter( @@ -833,7 +833,7 @@ TEST_F(ParquetTableScanTest, filterNullIcebergPartition) { "SELECT c0, c1 FROM tmp WHERE c1 IS NULL", std::unordered_map< std::string, - std::shared_ptr>{ + std::shared_ptr>{ {"c0", c0}, {"c1", c1}}); } @@ -1350,15 +1350,15 @@ TEST_F(ParquetTableScanTest, booleanRle) { writeToParquetFile(file->getPath(), {vector}, options); loadData(file->getPath(), schema, vector); - std::shared_ptr c0 = makeColumnHandle( + std::shared_ptr c0 = makeColumnHandle( "c0", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); - std::shared_ptr c1 = makeColumnHandle( + std::shared_ptr c1 = makeColumnHandle( "c1", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); - std::shared_ptr c2 = makeColumnHandle( + std::shared_ptr c2 = makeColumnHandle( "c2", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); - std::shared_ptr c3 = makeColumnHandle( + std::shared_ptr c3 = makeColumnHandle( "c3", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); - std::shared_ptr c4 = makeColumnHandle( + std::shared_ptr c4 = makeColumnHandle( "c4", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); assertSelect({"c0"}, "SELECT c0 FROM tmp"); @@ -1386,11 +1386,11 @@ TEST_F(ParquetTableScanTest, singleBooleanRle) { writeToParquetFile(file->getPath(), {vector}, options); loadData(file->getPath(), schema, vector); - std::shared_ptr c0 = makeColumnHandle( + std::shared_ptr c0 = makeColumnHandle( "c0", 
BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); - std::shared_ptr c1 = makeColumnHandle( + std::shared_ptr c1 = makeColumnHandle( "c1", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); - std::shared_ptr c2 = makeColumnHandle( + std::shared_ptr c2 = makeColumnHandle( "c2", BOOLEAN(), BOOLEAN(), {}, HiveColumnHandle::ColumnType::kRegular); assertSelect({"c0"}, "SELECT c0 FROM tmp"); diff --git a/velox/dwio/parquet/tests/writer/ParquetWriterTest.cpp b/velox/dwio/parquet/tests/writer/ParquetWriterTest.cpp index 3dc9f93990eb..ba2f1da1bbb1 100644 --- a/velox/dwio/parquet/tests/writer/ParquetWriterTest.cpp +++ b/velox/dwio/parquet/tests/writer/ParquetWriterTest.cpp @@ -45,16 +45,16 @@ class ParquetWriterTest : public ParquetTestBase { memory::MemoryManager::testingSetInstance(memory::MemoryManager::Options{}); testutil::TestValue::enable(); filesystems::registerLocalFileSystem(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); auto hiveConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( kHiveConnectorId, std::make_shared( std::unordered_map())); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); parquet::registerParquetWriterFactory(); } diff --git a/velox/dwio/parquet/writer/Writer.cpp b/velox/dwio/parquet/writer/Writer.cpp index 895f88a57ae9..df9d0e6ccccc 100644 --- a/velox/dwio/parquet/writer/Writer.cpp +++ b/velox/dwio/parquet/writer/Writer.cpp @@ -108,16 +108,16 @@ struct ArrowContext { }; Compression::type getArrowParquetCompression( - common::CompressionKind compression) { - if (compression == common::CompressionKind_SNAPPY) { + velox::common::CompressionKind compression) { + if (compression == velox::common::CompressionKind_SNAPPY) { return Compression::SNAPPY; - } else if (compression == common::CompressionKind_GZIP) { + } else if (compression == velox::common::CompressionKind_GZIP) { return Compression::GZIP; - } else if (compression == common::CompressionKind_ZSTD) { + } else if (compression == velox::common::CompressionKind_ZSTD) { return Compression::ZSTD; - } else if (compression == common::CompressionKind_NONE) { + } else if (compression == velox::common::CompressionKind_NONE) { return Compression::UNCOMPRESSED; - } else if (compression == common::CompressionKind_LZ4) { + } else if (compression == velox::common::CompressionKind_LZ4) { return Compression::LZ4_HADOOP; } else { VELOX_FAIL("Unsupported compression {}", compression); @@ -142,7 +142,7 @@ std::shared_ptr getArrowParquetWriterOptions( properties = properties->disable_dictionary(); } properties = properties->compression(getArrowParquetCompression( - options.compressionKind.value_or(common::CompressionKind_NONE))); + options.compressionKind.value_or(velox::common::CompressionKind_NONE))); for (const auto& columnCompressionValues : options.columnCompressionsMap) { properties->compression( columnCompressionValues.first, @@ -332,7 +332,7 @@ Writer::Writer( static_cast(options.parquetWriteTimestampUnit.value_or( TimestampPrecision::kNanoseconds)); options_.timestampTimeZone = options.parquetWriteTimestampTimeZone; - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::parquet::Writer::Writer", &options_); arrowContext_->properties = getArrowParquetWriterOptions(options, flushPolicy_); @@ -423,7 +423,7 @@ void Writer::write(const 
VectorPtr& data) { // Convert the arrow schema to Schema and then update the column names based // on schema_. auto arrowSchema = ::arrow::ImportSchema(&schema).ValueOrDie(); - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::parquet::Writer::write", arrowSchema.get()); std::vector> newFields; auto childSize = schema_->size(); @@ -459,7 +459,7 @@ void Writer::write(const VectorPtr& data) { arrowContext_->stagingBytes += bytes; } -bool Writer::isCodecAvailable(common::CompressionKind compression) { +bool Writer::isCodecAvailable(velox::common::CompressionKind compression) { return arrow::util::Codec::IsAvailable( getArrowParquetCompression(compression)); } diff --git a/velox/dwio/parquet/writer/Writer.h b/velox/dwio/parquet/writer/Writer.h index 460f01a7f240..7b4f35622293 100644 --- a/velox/dwio/parquet/writer/Writer.h +++ b/velox/dwio/parquet/writer/Writer.h @@ -96,7 +96,7 @@ struct WriterOptions : public dwio::common::WriterOptions { arrow::Encoding::type encoding = arrow::Encoding::PLAIN; std::shared_ptr codecOptions; - std::unordered_map + std::unordered_map columnCompressionsMap; /// Timestamp unit for Parquet write through Arrow bridge. @@ -168,7 +168,7 @@ class Writer : public dwio::common::Writer { ~Writer() override = default; - static bool isCodecAvailable(common::CompressionKind compression); + static bool isCodecAvailable(velox::common::CompressionKind compression); // Appends 'data' into the writer. void write(const VectorPtr& data) override; diff --git a/velox/examples/ScanAndSort.cpp b/velox/examples/ScanAndSort.cpp index 67aa80c67a32..82a84da03c61 100644 --- a/velox/examples/ScanAndSort.cpp +++ b/velox/examples/ScanAndSort.cpp @@ -79,25 +79,25 @@ int main(int argc, char** argv) { } // In order to read and write data and files from storage, we need to use a - // Connector. Let's instantiate and register a HiveConnector for this + // connector::common::Connector. Let's instantiate and register a HiveConnector for this // example: // We need a connector id string to identify the connector. const std::string kHiveConnectorId = "test-hive"; - // Register the Hive Connector Factory. - connector::registerConnectorFactory( + // Register the Hive connector::common::Connector Factory. + connector::common::registerConnectorFactory( std::make_shared()); // Create a new connector instance from the connector factory and register // it: auto hiveConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( kHiveConnectorId, std::make_shared( std::unordered_map())); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); // To be able to read local files, we need to register the local file // filesystem. 
We also need to register the dwrf reader factory as well as a diff --git a/velox/exec/Driver.cpp b/velox/exec/Driver.cpp index 7a9d3654ae9a..cf7bee24beaf 100644 --- a/velox/exec/Driver.cpp +++ b/velox/exec/Driver.cpp @@ -106,7 +106,7 @@ velox::memory::MemoryPool* DriverCtx::addOperatorPool( planNodeId, splitGroupId, pipelineId, driverId, operatorType); } -std::optional DriverCtx::makeSpillConfig( +std::optional DriverCtx::makeSpillConfig( int32_t operatorId) const { const auto& queryConfig = task->queryCtx()->queryConfig(); if (!queryConfig.spillEnabled()) { @@ -115,17 +115,17 @@ std::optional DriverCtx::makeSpillConfig( if (task->spillDirectory().empty() && !task->hasCreateSpillDirectoryCb()) { return std::nullopt; } - common::GetSpillDirectoryPathCB getSpillDirPathCb = + velox::common::GetSpillDirectoryPathCB getSpillDirPathCb = [this]() -> std::string_view { return task->getOrCreateSpillDirectory(); }; const auto& spillFilePrefix = fmt::format("{}_{}_{}", pipelineId, driverId, operatorId); - common::UpdateAndCheckSpillLimitCB updateAndCheckSpillLimitCb = + velox::common::UpdateAndCheckSpillLimitCB updateAndCheckSpillLimitCb = [this](uint64_t bytes) { task->queryCtx()->updateSpilledBytesAndCheckLimit(bytes); }; - return common::SpillConfig( + return velox::common::SpillConfig( std::move(getSpillDirPathCb), std::move(updateAndCheckSpillLimitCb), spillFilePrefix, diff --git a/velox/exec/Driver.h b/velox/exec/Driver.h index a5ca6494cf50..30fc6a458681 100644 --- a/velox/exec/Driver.h +++ b/velox/exec/Driver.h @@ -302,10 +302,10 @@ struct DriverCtx { const std::string& operatorType); /// Builds the spill config for the operator with specified 'operatorId'. - std::optional makeSpillConfig(int32_t operatorId) const; + std::optional makeSpillConfig(int32_t operatorId) const; - common::PrefixSortConfig prefixSortConfig() const { - return common::PrefixSortConfig{ + velox::common::PrefixSortConfig prefixSortConfig() const { + return velox::common::PrefixSortConfig{ queryConfig().prefixSortNormalizedKeyMaxBytes(), queryConfig().prefixSortMinRows(), queryConfig().prefixSortMaxStringPrefixLength()}; diff --git a/velox/exec/Exchange.cpp b/velox/exec/Exchange.cpp index 8582c804124f..1d77df839283 100644 --- a/velox/exec/Exchange.cpp +++ b/velox/exec/Exchange.cpp @@ -29,7 +29,7 @@ std::unique_ptr getVectorSerdeOptions( ? 
std::make_unique() : std::make_unique(); options->compressionKind = - common::stringToCompressionKind(queryConfig.shuffleCompressionKind()); + velox::common::stringToCompressionKind(queryConfig.shuffleCompressionKind()); return options; } } // namespace diff --git a/velox/exec/Exchange.h b/velox/exec/Exchange.h index a6f7c261d50f..1ec308b93ea2 100644 --- a/velox/exec/Exchange.h +++ b/velox/exec/Exchange.h @@ -25,11 +25,11 @@ namespace facebook::velox::exec { -struct RemoteConnectorSplit : public connector::ConnectorSplit { +struct RemoteConnectorSplit : public connector::common::ConnectorSplit { const std::string taskId; explicit RemoteConnectorSplit(const std::string& remoteTaskId) - : ConnectorSplit(""), taskId(remoteTaskId) {} + : connector::common::ConnectorSplit(""), taskId(remoteTaskId) {} std::string toString() const override { return fmt::format("Remote: {}", taskId); diff --git a/velox/exec/GroupingSet.cpp b/velox/exec/GroupingSet.cpp index c9da8832986c..5982aed548b0 100644 --- a/velox/exec/GroupingSet.cpp +++ b/velox/exec/GroupingSet.cpp @@ -51,10 +51,10 @@ GroupingSet::GroupingSet( bool isRawInput, const std::vector& globalGroupingSets, const std::optional& groupIdChannel, - const common::SpillConfig* spillConfig, + const velox::common::SpillConfig* spillConfig, tsan_atomic* nonReclaimableSection, OperatorCtx* operatorCtx, - folly::Synchronized* spillStats) + folly::Synchronized* spillStats) : preGroupedKeyChannels_(std::move(preGroupedKeys)), groupingKeyOutputProjections_(std::move(groupingKeyOutputProjections)), hashers_(std::move(hashers)), @@ -990,7 +990,7 @@ RowTypePtr GroupingSet::makeSpillType() const { return ROW(std::move(names), std::move(types)); } -std::optional GroupingSet::spilledStats() const { +std::optional GroupingSet::spilledStats() const { if (!hasSpilled()) { return std::nullopt; } @@ -1523,8 +1523,8 @@ AggregationInputSpiller::AggregationInputSpiller( RowTypePtr rowType, const HashBitRange& hashBitRange, const std::vector& sortingKeys, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats) + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats) : SpillerBase( container, std::move(rowType), @@ -1539,8 +1539,8 @@ AggregationInputSpiller::AggregationInputSpiller( AggregationOutputSpiller::AggregationOutputSpiller( RowContainer* container, RowTypePtr rowType, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats) + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats) : SpillerBase( container, std::move(rowType), diff --git a/velox/exec/GroupingSet.h b/velox/exec/GroupingSet.h index a599fb9b628d..a0fcc82284fa 100644 --- a/velox/exec/GroupingSet.h +++ b/velox/exec/GroupingSet.h @@ -41,10 +41,10 @@ class GroupingSet { bool isRawInput, const std::vector& globalGroupingSets, const std::optional& groupIdChannel, - const common::SpillConfig* spillConfig, + const velox::common::SpillConfig* spillConfig, tsan_atomic* nonReclaimableSection, OperatorCtx* operatorCtx, - folly::Synchronized* spillStats); + folly::Synchronized* spillStats); ~GroupingSet(); @@ -117,7 +117,7 @@ class GroupingSet { void spill(const RowContainerIterator& rowIterator); /// Returns the spiller stats including total bytes and rows spilled so far. - std::optional spilledStats() const; + std::optional spilledStats() const; /// Returns true if spilling has triggered on this grouping set. bool hasSpilled() const; @@ -318,7 +318,7 @@ class GroupingSet { // Column for groupId for a GROUPING SET. 
std::optional groupIdChannel_; - const common::SpillConfig* const spillConfig_; + const velox::common::SpillConfig* const spillConfig_; // Indicates if this grouping set and the associated hash aggregation operator // is under non-reclaimable execution section or not. @@ -410,7 +410,7 @@ class GroupingSet { // state of aggregate for all rows. std::vector firstGroup_; - folly::Synchronized* const spillStats_; + folly::Synchronized* const spillStats_; }; class AggregationInputSpiller : public SpillerBase { @@ -422,8 +422,8 @@ class AggregationInputSpiller : public SpillerBase { RowTypePtr rowType, const HashBitRange& hashBitRange, const std::vector& sortingKeys, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats); + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats); void spill(); @@ -444,8 +444,8 @@ class AggregationOutputSpiller : public SpillerBase { AggregationOutputSpiller( RowContainer* container, RowTypePtr rowType, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats); + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats); void spill(const RowContainerIterator& startRowIter); diff --git a/velox/exec/HashBuild.cpp b/velox/exec/HashBuild.cpp index 4416549c8d33..046c64949bc0 100644 --- a/velox/exec/HashBuild.cpp +++ b/velox/exec/HashBuild.cpp @@ -1183,8 +1183,8 @@ HashBuildSpiller::HashBuildSpiller( RowContainer* container, RowTypePtr rowType, HashBitRange bits, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats) + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats) : SpillerBase( container, std::move(rowType), diff --git a/velox/exec/HashBuild.h b/velox/exec/HashBuild.h index b2df3dc4a283..ec80802a199d 100644 --- a/velox/exec/HashBuild.h +++ b/velox/exec/HashBuild.h @@ -329,8 +329,8 @@ class HashBuildSpiller : public SpillerBase { RowContainer* container, RowTypePtr rowType, HashBitRange bits, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats); + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats); /// Invoked to spill all the rows stored in the row container of the hash /// build. 
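Most hunks in this part of the change replace unqualified common:: spellings with velox::common::, alongside the connector::common::registerConnectorFactory / connector::common::ConnectorSplit qualifications visible above. One plausible reason for the fully qualified spelling (an assumption on my part, not stated in the patch) is that once a nested connector::common namespace exists, unqualified common:: inside connector-adjacent code resolves to that nested namespace instead of facebook::velox::common, while velox::common:: still resolves correctly. A minimal standalone sketch of that lookup behavior follows; the namespaces and types in it are illustrative stand-ins, not the actual Velox headers.

// Illustrative stand-in namespaces (not the real Velox headers): shows why an
// unqualified common:: inside connector code stops resolving once a sibling
// connector::common namespace exists, and why velox::common:: keeps working.
#include <iostream>

namespace facebook::velox::common {
struct SpillConfig {
  const char* name() const {
    return "facebook::velox::common::SpillConfig";
  }
};
} // namespace facebook::velox::common

namespace facebook::velox::connector::common {
struct ConnectorSplit {}; // The new nested 'common' shadows the outer one.
} // namespace facebook::velox::connector::common

namespace facebook::velox::connector {
// 'common::SpillConfig cfg;' would fail to compile here: unqualified lookup of
// 'common' finds connector::common first, which has no SpillConfig. Qualifying
// through 'velox::' bypasses the shadowing namespace.
inline velox::common::SpillConfig makeSpillConfig() {
  return velox::common::SpillConfig{};
}
} // namespace facebook::velox::connector

int main() {
  std::cout << facebook::velox::connector::makeSpillConfig().name() << "\n";
  return 0;
}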
diff --git a/velox/exec/HashJoinBridge.cpp b/velox/exec/HashJoinBridge.cpp index 7c359c924c3e..2e31032e773b 100644 --- a/velox/exec/HashJoinBridge.cpp +++ b/velox/exec/HashJoinBridge.cpp @@ -98,8 +98,8 @@ std::unique_ptr createSpiller( core::JoinType joinType, const RowTypePtr& tableType, const HashBitRange& hashBitRange, - const common::SpillConfig* spillConfig, - folly::Synchronized* stats) { + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* stats) { return std::make_unique( joinType, parentId, @@ -113,7 +113,7 @@ std::unique_ptr createSpiller( std::vector> spillHashJoinTable( const std::vector& spillers, - const common::SpillConfig* spillConfig) { + const velox::common::SpillConfig* spillConfig) { VELOX_CHECK_NOT_NULL(spillConfig); auto spillExecutor = spillConfig->executor; std::vector>> @@ -166,8 +166,8 @@ SpillPartitionSet spillHashJoinTable( std::optional parentId, const HashBitRange& hashBitRange, const std::shared_ptr& joinNode, - const common::SpillConfig* spillConfig, - folly::Synchronized* stats) { + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* stats) { VELOX_CHECK_NOT_NULL(table); VELOX_CHECK_NOT_NULL(spillConfig); if (table->numDistinct() == 0) { diff --git a/velox/exec/HashJoinBridge.h b/velox/exec/HashJoinBridge.h index 879eab6801f8..e8409945e74a 100644 --- a/velox/exec/HashJoinBridge.h +++ b/velox/exec/HashJoinBridge.h @@ -243,7 +243,7 @@ struct HashJoinTableSpillResult { /// a partially built hash join table. std::vector> spillHashJoinTable( const std::vector& spillers, - const common::SpillConfig* spillConfig); + const velox::common::SpillConfig* spillConfig); /// Invoked to spill 'table' and returns spilled partitions. This is used by /// hash probe or hash join bridge to spill a fully built table. @@ -252,8 +252,8 @@ SpillPartitionSet spillHashJoinTable( std::optional parentId, const HashBitRange& hashBitRange, const std::shared_ptr& joinNode, - const common::SpillConfig* spillConfig, - folly::Synchronized* stats); + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* stats); /// Returns the type used to spill a given hash table type. The function /// might attach a boolean column at the end of 'tableType' if 'joinType' needs diff --git a/velox/exec/IndexLookupJoin.cpp b/velox/exec/IndexLookupJoin.cpp index f4b5e881c0da..75fff9c148ad 100644 --- a/velox/exec/IndexLookupJoin.cpp +++ b/velox/exec/IndexLookupJoin.cpp @@ -15,8 +15,8 @@ */ #include "velox/exec/IndexLookupJoin.h" +#include "velox/connectors/common/Connector.h" #include "velox/buffer/Buffer.h" -#include "velox/connectors/Connector.h" #include "velox/exec/Task.h" #include "velox/expression/Expr.h" #include "velox/expression/FieldReference.h" @@ -158,7 +158,7 @@ IndexLookupJoin::IndexLookupJoin( operatorType(), lookupTableHandle_->connectorId()), spillConfig_.has_value() ? 
&(spillConfig_.value()) : nullptr)}, - connector_(connector::getConnector(lookupTableHandle_->connectorId())), + connector_(connector::common::getConnector(lookupTableHandle_->connectorId())), maxNumInputBatches_( 1 + driverCtx->queryConfig().indexLookupJoinMaxPrefetchBatches()), joinNode_{joinNode} { @@ -474,7 +474,7 @@ void IndexLookupJoin::startLookup(InputBatchState& batch) { VELOX_CHECK(!batch.lookupFuture.valid()); batch.lookupResultIter = indexSource_->lookup( - connector::IndexSource::LookupRequest{batch.lookupInput}); + connector::common::IndexSource::LookupRequest{batch.lookupInput}); auto lookupResultOr = batch.lookupResultIter->next(outputBatchSize_, batch.lookupFuture); if (!lookupResultOr.has_value()) { diff --git a/velox/exec/IndexLookupJoin.h b/velox/exec/IndexLookupJoin.h index 142e2c62ea9a..123a114e22ae 100644 --- a/velox/exec/IndexLookupJoin.h +++ b/velox/exec/IndexLookupJoin.h @@ -79,8 +79,8 @@ class IndexLookupJoin : public Operator { "clientLookupResultSize"}; private: - using LookupResultIter = connector::IndexSource::LookupResultIterator; - using LookupResult = connector::IndexSource::LookupResult; + using LookupResultIter = connector::common::IndexSource::LookupResultIterator; + using LookupResult = connector::common::IndexSource::LookupResult; // Contains the state of an input batch processing. struct InputBatchState { @@ -193,14 +193,14 @@ class IndexLookupJoin : public Operator { const size_t numKeys_; const RowTypePtr probeType_; const RowTypePtr lookupType_; - const std::shared_ptr lookupTableHandle_; + const std::shared_ptr lookupTableHandle_; const std::vector lookupConditions_; std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> lookupColumnHandles_; - const std::shared_ptr connectorQueryCtx_; - const std::shared_ptr connector_; + const std::shared_ptr connectorQueryCtx_; + const std::shared_ptr connector_; const size_t maxNumInputBatches_; // The lookup join plan node used to initialize this operator and reset after @@ -230,7 +230,7 @@ class IndexLookupJoin : public Operator { std::vector probeOutputProjections_; std::vector lookupOutputProjections_; - std::shared_ptr indexSource_; + std::shared_ptr indexSource_; // Points to the next output row in 'lookupResult_' for processing until // reaches to the end of 'lookupResult_'. diff --git a/velox/exec/Merge.cpp b/velox/exec/Merge.cpp index 51b46fdcaca7..4c2494cbf98b 100644 --- a/velox/exec/Merge.cpp +++ b/velox/exec/Merge.cpp @@ -30,7 +30,7 @@ std::unique_ptr getVectorSerdeOptions( ? 
std::make_unique() : std::make_unique(); options->compressionKind = - common::stringToCompressionKind(queryConfig.shuffleCompressionKind()); + velox::common::stringToCompressionKind(queryConfig.shuffleCompressionKind()); return options; } } // namespace diff --git a/velox/exec/MergeSource.cpp b/velox/exec/MergeSource.cpp index cc9a4a4be834..e6e538a57602 100644 --- a/velox/exec/MergeSource.cpp +++ b/velox/exec/MergeSource.cpp @@ -314,7 +314,7 @@ BlockingReason MergeJoinSource::next( RowVectorPtr* data, bool& drained) { drained = false; - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::exec::MergeJoinSource::next", this); ScopedPromiseNotification notification(1); return state_.withWLock([&](auto& state) { @@ -347,7 +347,7 @@ BlockingReason MergeJoinSource::next( BlockingReason MergeJoinSource::enqueue( RowVectorPtr data, ContinueFuture* future) { - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::exec::MergeJoinSource::enqueue", this); ScopedPromiseNotification notification(1); return state_.withWLock([&](auto& state) { diff --git a/velox/exec/Operator.cpp b/velox/exec/Operator.cpp index 29bfbef7cb5e..f50ad8b340d4 100644 --- a/velox/exec/Operator.cpp +++ b/velox/exec/Operator.cpp @@ -46,14 +46,14 @@ core::ExecCtx* OperatorCtx::execCtx() const { return execCtx_.get(); } -std::shared_ptr +std::shared_ptr OperatorCtx::createConnectorQueryCtx( const std::string& connectorId, const std::string& planNodeId, memory::MemoryPool* connectorPool, - const common::SpillConfig* spillConfig) const { + const velox::common::SpillConfig* spillConfig) const { const auto& task = driverCtx_->task; - auto connectorQueryCtx = std::make_shared( + auto connectorQueryCtx = std::make_shared( pool_, connectorPool, task->queryCtx()->connectorSessionProperties(connectorId), @@ -80,7 +80,7 @@ Operator::Operator( int32_t operatorId, std::string planNodeId, std::string operatorType, - std::optional spillConfig) + std::optional spillConfig) : operatorCtx_(std::make_unique( driverCtx, planNodeId, @@ -445,7 +445,7 @@ void Operator::recordSpillStats() { lockedStats->addRuntimeStat( kSpillRuns, RuntimeCounter{static_cast(lockedSpillStats->spillRuns)}); - common::updateGlobalSpillRunStats(lockedSpillStats->spillRuns); + velox::common::updateGlobalSpillRunStats(lockedSpillStats->spillRuns); } if (lockedSpillStats->spillMaxLevelExceededCount != 0) { @@ -453,7 +453,7 @@ void Operator::recordSpillStats() { kExceededMaxSpillLevel, RuntimeCounter{static_cast( lockedSpillStats->spillMaxLevelExceededCount)}); - common::updateGlobalMaxSpillLevelExceededCount( + velox::common::updateGlobalMaxSpillLevelExceededCount( lockedSpillStats->spillMaxLevelExceededCount); } diff --git a/velox/exec/Operator.h b/velox/exec/Operator.h index fd3bb3234ac5..f185b4abd4ca 100644 --- a/velox/exec/Operator.h +++ b/velox/exec/Operator.h @@ -87,11 +87,11 @@ class OperatorCtx { /// is the id of the calling TableScan. This and the task id identify the scan /// for column access tracking. 'connectorPool' is an aggregate memory pool /// for connector use. 
- std::shared_ptr createConnectorQueryCtx( + std::shared_ptr createConnectorQueryCtx( const std::string& connectorId, const std::string& planNodeId, memory::MemoryPool* connectorPool, - const common::SpillConfig* spillConfig = nullptr) const; + const velox::common::SpillConfig* spillConfig = nullptr) const; private: DriverCtx* const driverCtx_; @@ -204,7 +204,7 @@ class Operator : public BaseRuntimeStatWriter { int32_t operatorId, std::string planNodeId, std::string operatorType, - std::optional spillConfig = std::nullopt); + std::optional spillConfig = std::nullopt); virtual ~Operator() = default; @@ -294,7 +294,7 @@ class Operator : public BaseRuntimeStatWriter { /// hash join into probe-side table scan. Can also be used to push down TopN /// cutoff. virtual const std:: - unordered_map>& + unordered_map>& getDynamicFilters() const { return dynamicFilters_; } @@ -316,7 +316,7 @@ class Operator : public BaseRuntimeStatWriter { virtual void addDynamicFilter( const core::PlanNodeId& /*producer*/, column_index_t /*outputChannel*/, - const std::shared_ptr& /*filter*/) { + const std::shared_ptr& /*filter*/) { VELOX_UNSUPPORTED( "This operator doesn't support dynamic filter pushdown: {}", toString()); @@ -579,7 +579,7 @@ class Operator : public BaseRuntimeStatWriter { return spillConfig_.has_value(); } - const common::SpillConfig* spillConfig() const { + const velox::common::SpillConfig* spillConfig() const { return spillConfig_.has_value() ? &spillConfig_.value() : nullptr; } @@ -630,12 +630,12 @@ class Operator : public BaseRuntimeStatWriter { const RowTypePtr outputType_; /// Contains the disk spilling related configs if spilling is enabled (e.g. /// the fs dir path to store spill files), otherwise null. - const std::optional spillConfig_; + const std::optional spillConfig_; bool initialized_{false}; folly::Synchronized stats_; - folly::Synchronized spillStats_; + folly::Synchronized spillStats_; /// NOTE: only one of the two could be set for an operator for tracing . /// 'splitTracer_' is only set for table scan to record the processed split @@ -664,7 +664,7 @@ class Operator : public BaseRuntimeStatWriter { /// could copy directly from input to output if no cardinality change. bool isIdentityProjection_ = false; - std::unordered_map> + std::unordered_map> dynamicFilters_; private: diff --git a/velox/exec/OperatorTraceReader.cpp b/velox/exec/OperatorTraceReader.cpp index 2e92b6c40711..ffca6c81bdff 100644 --- a/velox/exec/OperatorTraceReader.cpp +++ b/velox/exec/OperatorTraceReader.cpp @@ -132,7 +132,7 @@ OperatorTraceSplitReader::getSplitInputStream( // static std::vector OperatorTraceSplitReader::deserialize( - common::FileInputStream* stream) { + velox::common::FileInputStream* stream) { std::vector splits; try { while (!stream->atEnd()) { diff --git a/velox/exec/OperatorTraceReader.h b/velox/exec/OperatorTraceReader.h index 6a8e7c656508..c19f151073a9 100644 --- a/velox/exec/OperatorTraceReader.h +++ b/velox/exec/OperatorTraceReader.h @@ -42,7 +42,7 @@ class OperatorTraceInputReader { const std::string traceDir_; const serializer::presto::PrestoVectorSerde::PrestoOptions readOptions_{ true, - common::CompressionKind_ZSTD, // TODO: Use trace config. + velox::common::CompressionKind_ZSTD, // TODO: Use trace config. 
0.8, /*_nullsFirst=*/true}; const std::shared_ptr fs_; @@ -86,7 +86,7 @@ class OperatorTraceSplitReader { std::vector read() const; private: - static std::vector deserialize(common::FileInputStream* stream); + static std::vector deserialize(velox::common::FileInputStream* stream); std::unique_ptr getSplitInputStream( const std::string& traceDir) const; diff --git a/velox/exec/OperatorTraceWriter.h b/velox/exec/OperatorTraceWriter.h index 437f43daba0b..9234e219b672 100644 --- a/velox/exec/OperatorTraceWriter.h +++ b/velox/exec/OperatorTraceWriter.h @@ -57,7 +57,7 @@ class OperatorTraceInputWriter { // TODO: make 'useLosslessTimestamp' configuerable. const serializer::presto::PrestoVectorSerde::PrestoOptions options_ = { true, - common::CompressionKind::CompressionKind_ZSTD, + velox::common::CompressionKind::CompressionKind_ZSTD, 0.8, /*nullsFirst=*/true}; const std::shared_ptr fs_; diff --git a/velox/exec/OutputBuffer.cpp b/velox/exec/OutputBuffer.cpp index eb00d51852f4..a61ae3f849f9 100644 --- a/velox/exec/OutputBuffer.cpp +++ b/velox/exec/OutputBuffer.cpp @@ -476,7 +476,7 @@ bool OutputBuffer::enqueue( } if (bufferedBytes_ >= maxSize_ && future) { - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::exec::OutputBuffer::enqueue", this); promises_.emplace_back("OutputBuffer::enqueue"); @@ -602,7 +602,7 @@ void OutputBuffer::checkIfDone(bool oneDriverFinished) { } } - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::exec::OutputBuffer::checkIfDone", this); } } diff --git a/velox/exec/PartitionStreamingWindowBuild.cpp b/velox/exec/PartitionStreamingWindowBuild.cpp index 331d3e6f6e1e..41df195119fc 100644 --- a/velox/exec/PartitionStreamingWindowBuild.cpp +++ b/velox/exec/PartitionStreamingWindowBuild.cpp @@ -21,7 +21,7 @@ namespace facebook::velox::exec { PartitionStreamingWindowBuild::PartitionStreamingWindowBuild( const std::shared_ptr& windowNode, velox::memory::MemoryPool* pool, - const common::SpillConfig* spillConfig, + const velox::common::SpillConfig* spillConfig, tsan_atomic* nonReclaimableSection) : WindowBuild(windowNode, pool, spillConfig, nonReclaimableSection) {} diff --git a/velox/exec/PartitionStreamingWindowBuild.h b/velox/exec/PartitionStreamingWindowBuild.h index bb5cb352d24f..48cadd135e44 100644 --- a/velox/exec/PartitionStreamingWindowBuild.h +++ b/velox/exec/PartitionStreamingWindowBuild.h @@ -29,7 +29,7 @@ class PartitionStreamingWindowBuild : public WindowBuild { PartitionStreamingWindowBuild( const std::shared_ptr& windowNode, velox::memory::MemoryPool* pool, - const common::SpillConfig* spillConfig, + const velox::common::SpillConfig* spillConfig, tsan_atomic* nonReclaimableSection); void addInput(RowVectorPtr input) override; @@ -38,7 +38,7 @@ class PartitionStreamingWindowBuild : public WindowBuild { VELOX_UNREACHABLE(); } - std::optional spilledStats() const override { + std::optional spilledStats() const override { return std::nullopt; } diff --git a/velox/exec/PartitionedOutput.cpp b/velox/exec/PartitionedOutput.cpp index 28878ef21d15..3622d8e0df91 100644 --- a/velox/exec/PartitionedOutput.cpp +++ b/velox/exec/PartitionedOutput.cpp @@ -28,7 +28,7 @@ std::unique_ptr getVectorSerdeOptions( ? 
std::make_unique() : std::make_unique(); options->compressionKind = - common::stringToCompressionKind(queryConfig.shuffleCompressionKind()); + velox::common::stringToCompressionKind(queryConfig.shuffleCompressionKind()); options->minCompressionRatio = PartitionedOutput::minCompressionRatio(); return options; } diff --git a/velox/exec/RowNumber.cpp b/velox/exec/RowNumber.cpp index 31421d0d29b5..a5d5a4a83316 100644 --- a/velox/exec/RowNumber.cpp +++ b/velox/exec/RowNumber.cpp @@ -543,8 +543,8 @@ RowNumberHashTableSpiller::RowNumberHashTableSpiller( std::optional parentId, RowTypePtr rowType, HashBitRange bits, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats) + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats) : SpillerBase( container, std::move(rowType), diff --git a/velox/exec/RowNumber.h b/velox/exec/RowNumber.h index 255827ece03e..f5d9cb928c09 100644 --- a/velox/exec/RowNumber.h +++ b/velox/exec/RowNumber.h @@ -160,8 +160,8 @@ class RowNumberHashTableSpiller : public SpillerBase { std::optional parentId, RowTypePtr rowType, HashBitRange bits, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats); + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats); void spill(); diff --git a/velox/exec/RowsStreamingWindowBuild.cpp b/velox/exec/RowsStreamingWindowBuild.cpp index a3dedd8a60cd..195cea788e26 100644 --- a/velox/exec/RowsStreamingWindowBuild.cpp +++ b/velox/exec/RowsStreamingWindowBuild.cpp @@ -34,7 +34,7 @@ bool hasRangeFrame(const std::shared_ptr& windowNode) { RowsStreamingWindowBuild::RowsStreamingWindowBuild( const std::shared_ptr& windowNode, velox::memory::MemoryPool* pool, - const common::SpillConfig* spillConfig, + const velox::common::SpillConfig* spillConfig, tsan_atomic* nonReclaimableSection) : WindowBuild(windowNode, pool, spillConfig, nonReclaimableSection), hasRangeFrame_(hasRangeFrame(windowNode)) { diff --git a/velox/exec/RowsStreamingWindowBuild.h b/velox/exec/RowsStreamingWindowBuild.h index 065ff4c7c44c..a224d18c710f 100644 --- a/velox/exec/RowsStreamingWindowBuild.h +++ b/velox/exec/RowsStreamingWindowBuild.h @@ -32,7 +32,7 @@ class RowsStreamingWindowBuild : public WindowBuild { RowsStreamingWindowBuild( const std::shared_ptr& windowNode, velox::memory::MemoryPool* pool, - const common::SpillConfig* spillConfig, + const velox::common::SpillConfig* spillConfig, tsan_atomic* nonReclaimableSection); void addInput(RowVectorPtr input) override; @@ -41,7 +41,7 @@ class RowsStreamingWindowBuild : public WindowBuild { VELOX_UNREACHABLE(); } - std::optional spilledStats() const override { + std::optional spilledStats() const override { return std::nullopt; } diff --git a/velox/exec/SortBuffer.cpp b/velox/exec/SortBuffer.cpp index d64f858d95f1..2d35c57d5bb2 100644 --- a/velox/exec/SortBuffer.cpp +++ b/velox/exec/SortBuffer.cpp @@ -26,8 +26,8 @@ SortBuffer::SortBuffer( const std::vector& sortCompareFlags, velox::memory::MemoryPool* pool, tsan_atomic* nonReclaimableSection, - common::PrefixSortConfig prefixSortConfig, - const common::SpillConfig* spillConfig, + velox::common::PrefixSortConfig prefixSortConfig, + const velox::common::SpillConfig* spillConfig, folly::Synchronized* spillStats) : input_(input), sortCompareFlags_(sortCompareFlags), diff --git a/velox/exec/SortBuffer.h b/velox/exec/SortBuffer.h index 9804e2f85890..02c96057ee2e 100644 --- a/velox/exec/SortBuffer.h +++ b/velox/exec/SortBuffer.h @@ -38,8 +38,8 @@ class SortBuffer { const std::vector& 
sortCompareFlags, velox::memory::MemoryPool* pool, tsan_atomic* nonReclaimableSection, - common::PrefixSortConfig prefixSortConfig, - const common::SpillConfig* spillConfig = nullptr, + velox::common::PrefixSortConfig prefixSortConfig, + const velox::common::SpillConfig* spillConfig = nullptr, folly::Synchronized* spillStats = nullptr); ~SortBuffer(); @@ -121,11 +121,11 @@ class SortBuffer { tsan_atomic* const nonReclaimableSection_; // Configuration settings for prefix-sort. - const common::PrefixSortConfig prefixSortConfig_; + const velox::common::PrefixSortConfig prefixSortConfig_; - const common::SpillConfig* const spillConfig_; + const velox::common::SpillConfig* const spillConfig_; - folly::Synchronized* const spillStats_; + folly::Synchronized* const spillStats_; // The column projection map between 'input_' and 'spillerStoreType_' as sort // buffer stores the sort columns first in 'data_'. diff --git a/velox/exec/SortWindowBuild.cpp b/velox/exec/SortWindowBuild.cpp index f25175cc2cfa..13537f8f0b4e 100644 --- a/velox/exec/SortWindowBuild.cpp +++ b/velox/exec/SortWindowBuild.cpp @@ -42,10 +42,10 @@ std::vector makeCompareFlags( SortWindowBuild::SortWindowBuild( const std::shared_ptr& node, velox::memory::MemoryPool* pool, - common::PrefixSortConfig&& prefixSortConfig, - const common::SpillConfig* spillConfig, + velox::common::PrefixSortConfig&& prefixSortConfig, + const velox::common::SpillConfig* spillConfig, tsan_atomic* nonReclaimableSection, - folly::Synchronized* spillStats) + folly::Synchronized* spillStats) : WindowBuild(node, pool, spillConfig, nonReclaimableSection), numPartitionKeys_{node->partitionKeys().size()}, compareFlags_{makeCompareFlags(numPartitionKeys_, node->sortingOrders())}, @@ -194,7 +194,7 @@ void SortWindowBuild::spill() { data_->pool()->release(); } -std::optional SortWindowBuild::spilledStats() const { +std::optional SortWindowBuild::spilledStats() const { if (spiller_ == nullptr) { return std::nullopt; } diff --git a/velox/exec/SortWindowBuild.h b/velox/exec/SortWindowBuild.h index 72875094007a..c9d01ffc1781 100644 --- a/velox/exec/SortWindowBuild.h +++ b/velox/exec/SortWindowBuild.h @@ -29,10 +29,10 @@ class SortWindowBuild : public WindowBuild { SortWindowBuild( const std::shared_ptr& node, velox::memory::MemoryPool* pool, - common::PrefixSortConfig&& prefixSortConfig, - const common::SpillConfig* spillConfig, + velox::common::PrefixSortConfig&& prefixSortConfig, + const velox::common::SpillConfig* spillConfig, tsan_atomic* nonReclaimableSection, - folly::Synchronized* spillStats); + folly::Synchronized* spillStats); ~SortWindowBuild() override { pool_->release(); @@ -47,7 +47,7 @@ class SortWindowBuild : public WindowBuild { void spill() override; - std::optional spilledStats() const override; + std::optional spilledStats() const override; void noMoreInput() override; @@ -90,9 +90,9 @@ class SortWindowBuild : public WindowBuild { memory::MemoryPool* const pool_; // Config for Prefix-sort. - const common::PrefixSortConfig prefixSortConfig_; + const velox::common::PrefixSortConfig prefixSortConfig_; - folly::Synchronized* const spillStats_; + folly::Synchronized* const spillStats_; // allKeyInfo_ is a combination of (partitionKeyInfo_ and sortKeyInfo_). 
// It is used to perform a full sorting of the input rows to be able to diff --git a/velox/exec/Spill.cpp b/velox/exec/Spill.cpp index 9b9f14eb75d6..7e5b6c81314f 100644 --- a/velox/exec/Spill.cpp +++ b/velox/exec/Spill.cpp @@ -61,16 +61,16 @@ void SpillMergeStream::close() { } SpillState::SpillState( - const common::GetSpillDirectoryPathCB& getSpillDirPathCb, - const common::UpdateAndCheckSpillLimitCB& updateAndCheckSpillLimitCb, + const velox::common::GetSpillDirectoryPathCB& getSpillDirPathCb, + const velox::common::UpdateAndCheckSpillLimitCB& updateAndCheckSpillLimitCb, const std::string& fileNamePrefix, const std::vector& sortingKeys, uint64_t targetFileSize, uint64_t writeBufferSize, - common::CompressionKind compressionKind, + velox::common::CompressionKind compressionKind, const std::optional& prefixSortConfig, memory::MemoryPool* pool, - folly::Synchronized* stats, + folly::Synchronized* stats, const std::string& fileCreateConfig) : getSpillDirPathCb_(getSpillDirPathCb), updateAndCheckSpillLimitCb_(updateAndCheckSpillLimitCb), @@ -111,7 +111,7 @@ void SpillState::setPartitionSpilled(const SpillPartitionId& id) { VELOX_DCHECK(!spilledPartitionIdSet_.contains(id)); spilledPartitionIdSet_.emplace(id); ++stats_->wlock()->spilledPartitions; - common::incrementGlobalSpilledPartitionStats(); + velox::common::incrementGlobalSpilledPartitionStats(); } /*static*/ @@ -129,7 +129,7 @@ void SpillState::validateSpillBytesSize(uint64_t bytes) { void SpillState::updateSpilledInputBytes(uint64_t bytes) { auto statsLocked = stats_->wlock(); statsLocked->spilledInputBytes += bytes; - common::updateGlobalSpillMemoryBytes(bytes); + velox::common::updateGlobalSpillMemoryBytes(bytes); } uint64_t SpillState::appendToPartition( @@ -278,7 +278,7 @@ std::unique_ptr> SpillPartition::createUnorderedReader( uint64_t bufferSize, memory::MemoryPool* pool, - folly::Synchronized* spillStats) { + folly::Synchronized* spillStats) { VELOX_CHECK_NOT_NULL(pool); std::vector> streams; streams.reserve(files_.size()); @@ -295,7 +295,7 @@ std::unique_ptr> SpillPartition::createOrderedReader( uint64_t bufferSize, memory::MemoryPool* pool, - folly::Synchronized* spillStats) { + folly::Synchronized* spillStats) { std::vector> streams; streams.reserve(files_.size()); for (auto& fileInfo : files_) { diff --git a/velox/exec/Spill.h b/velox/exec/Spill.h index ec7605af9593..ffb6e9be835b 100644 --- a/velox/exec/Spill.h +++ b/velox/exec/Spill.h @@ -457,7 +457,7 @@ class SpillPartition { std::unique_ptr> createUnorderedReader( uint64_t bufferSize, memory::MemoryPool* pool, - folly::Synchronized* spillStats); + folly::Synchronized* spillStats); /// Invoked to create an ordered stream reader from this spill partition. /// The created reader will take the ownership of the spill files. @@ -468,7 +468,7 @@ class SpillPartition { std::unique_ptr> createOrderedReader( uint64_t bufferSize, memory::MemoryPool* pool, - folly::Synchronized* spillStats); + folly::Synchronized* spillStats); std::string toString() const; @@ -531,16 +531,16 @@ class SpillState { /// target size of a single file. 'pool' owns the memory for state and /// results. 
SpillState( - const common::GetSpillDirectoryPathCB& getSpillDirectoryPath, - const common::UpdateAndCheckSpillLimitCB& updateAndCheckSpillLimitCb, + const velox::common::GetSpillDirectoryPathCB& getSpillDirectoryPath, + const velox::common::UpdateAndCheckSpillLimitCB& updateAndCheckSpillLimitCb, const std::string& fileNamePrefix, const std::vector& sortingKeys, uint64_t targetFileSize, uint64_t writeBufferSize, - common::CompressionKind compressionKind, + velox::common::CompressionKind compressionKind, const std::optional& prefixSortConfig, memory::MemoryPool* pool, - folly::Synchronized* stats, + folly::Synchronized* stats, const std::string& fileCreateConfig = {}); static std::vector makeSortingKeys( @@ -562,7 +562,7 @@ class SpillState { return targetFileSize_; } - common::CompressionKind compressionKind() const { + velox::common::CompressionKind compressionKind() const { return compressionKind_; } @@ -630,22 +630,22 @@ class SpillState { // A callback function that returns the spill directory path. // Implementations can use it to ensure the path exists before returning. - common::GetSpillDirectoryPathCB getSpillDirPathCb_; + velox::common::GetSpillDirectoryPathCB getSpillDirPathCb_; // Updates the aggregated spill bytes of this query, and throws if exceeds // the max spill bytes limit. - common::UpdateAndCheckSpillLimitCB updateAndCheckSpillLimitCb_; + velox::common::UpdateAndCheckSpillLimitCB updateAndCheckSpillLimitCb_; // Prefix for spill files. const std::string fileNamePrefix_; const std::vector sortingKeys_; const uint64_t targetFileSize_; const uint64_t writeBufferSize_; - const common::CompressionKind compressionKind_; + const velox::common::CompressionKind compressionKind_; const std::optional prefixSortConfig_; const std::string fileCreateConfig_; memory::MemoryPool* const pool_; - folly::Synchronized* const stats_; + folly::Synchronized* const stats_; // A set of spilled partition ids. 
SpillPartitionIdSet spilledPartitionIdSet_; diff --git a/velox/exec/SpillFile.cpp b/velox/exec/SpillFile.cpp index 0e612af59d40..6f95b19e08fc 100644 --- a/velox/exec/SpillFile.cpp +++ b/velox/exec/SpillFile.cpp @@ -77,9 +77,9 @@ SpillWriterBase::SpillWriterBase( uint64_t targetFileSize, const std::string& pathPrefix, const std::string& fileCreateConfig, - common::UpdateAndCheckSpillLimitCB& updateAndCheckSpillLimitCb, + velox::common::UpdateAndCheckSpillLimitCB& updateAndCheckSpillLimitCb, memory::MemoryPool* pool, - folly::Synchronized* stats) + folly::Synchronized* stats) : pool_(pool), stats_(stats), updateAndCheckSpillLimitCb_(updateAndCheckSpillLimitCb), @@ -170,14 +170,14 @@ void SpillWriterBase::updateWriteStats( statsLocked->spillFlushTimeNanos += flushTimeNs; statsLocked->spillWriteTimeNanos += writeTimeNs; ++statsLocked->spillWrites; - common::updateGlobalSpillWriteStats(spilledBytes, flushTimeNs, writeTimeNs); + velox::common::updateGlobalSpillWriteStats(spilledBytes, flushTimeNs, writeTimeNs); } void SpillWriterBase::updateSpilledFileStats(uint64_t fileSize) { ++stats_->wlock()->spilledFiles; addThreadLocalRuntimeStat( "spillFileSize", RuntimeCounter(fileSize, RuntimeCounter::Unit::kBytes)); - common::incrementGlobalSpilledFiles(); + velox::common::incrementGlobalSpilledFiles(); } void SpillWriterBase::updateAppendStats( @@ -186,20 +186,20 @@ void SpillWriterBase::updateAppendStats( auto statsLocked = stats_->wlock(); statsLocked->spilledRows += numRows; statsLocked->spillSerializationTimeNanos += serializationTimeNs; - common::updateGlobalSpillAppendStats(numRows, serializationTimeNs); + velox::common::updateGlobalSpillAppendStats(numRows, serializationTimeNs); } SpillWriter::SpillWriter( const RowTypePtr& type, const std::vector& sortingKeys, - common::CompressionKind compressionKind, + velox::common::CompressionKind compressionKind, const std::string& pathPrefix, uint64_t targetFileSize, uint64_t writeBufferSize, const std::string& fileCreateConfig, - common::UpdateAndCheckSpillLimitCB& updateAndCheckSpillLimitCb, + velox::common::UpdateAndCheckSpillLimitCB& updateAndCheckSpillLimitCb, memory::MemoryPool* pool, - folly::Synchronized* stats) + folly::Synchronized* stats) : SpillWriterBase( writeBufferSize, targetFileSize, @@ -294,7 +294,7 @@ std::unique_ptr SpillReadFile::create( const SpillFileInfo& fileInfo, uint64_t bufferSize, memory::MemoryPool* pool, - folly::Synchronized* stats) { + folly::Synchronized* stats) { return std::unique_ptr(new SpillReadFile( fileInfo.id, fileInfo.path, @@ -314,9 +314,9 @@ SpillReadFile::SpillReadFile( uint64_t bufferSize, const RowTypePtr& type, const std::vector& sortingKeys, - common::CompressionKind compressionKind, + velox::common::CompressionKind compressionKind, memory::MemoryPool* pool, - folly::Synchronized* stats) + folly::Synchronized* stats) : id_(id), path_(path), size_(size), @@ -350,14 +350,14 @@ bool SpillReadFile::nextBatch(RowVectorPtr& rowVector) { input_.get(), pool_, type_, serde_, &rowVector, &readOptions_); } stats_->wlock()->spillDeserializationTimeNanos += timeNs; - common::updateGlobalSpillDeserializationTimeNs(timeNs); + velox::common::updateGlobalSpillDeserializationTimeNs(timeNs); return true; } void SpillReadFile::recordSpillStats() { VELOX_CHECK(input_->atEnd()); const auto readStats = input_->stats(); - common::updateGlobalSpillReadStats( + velox::common::updateGlobalSpillReadStats( readStats.numReads, readStats.readBytes, readStats.readTimeNs); auto lockedSpillStats = stats_->wlock(); 
lockedSpillStats->spillReads += readStats.numReads; diff --git a/velox/exec/SpillFile.h b/velox/exec/SpillFile.h index f40b5c787592..d821a3c2736a 100644 --- a/velox/exec/SpillFile.h +++ b/velox/exec/SpillFile.h @@ -87,7 +87,7 @@ struct SpillFileInfo { /// The file size in bytes. uint64_t size; std::vector sortingKeys; - common::CompressionKind compressionKind; + velox::common::CompressionKind compressionKind; }; using SpillFiles = std::vector; @@ -106,9 +106,9 @@ class SpillWriterBase { uint64_t targetFileSize, const std::string& pathPrefix, const std::string& fileCreateConfig, - common::UpdateAndCheckSpillLimitCB& updateAndCheckSpillLimitCb, + velox::common::UpdateAndCheckSpillLimitCB& updateAndCheckSpillLimitCb, memory::MemoryPool* pool, - folly::Synchronized* stats); + folly::Synchronized* stats); virtual ~SpillWriterBase() = default; @@ -151,7 +151,7 @@ class SpillWriterBase { memory::MemoryPool* const pool_; - folly::Synchronized* const stats_; + folly::Synchronized* const stats_; std::unique_ptr currentFile_; @@ -181,7 +181,7 @@ class SpillWriterBase { // Updates the aggregated spill bytes of this query, and throws if exceeds // the max spill bytes limit. - const common::UpdateAndCheckSpillLimitCB updateAndCheckSpillLimitCb_; + const velox::common::UpdateAndCheckSpillLimitCB updateAndCheckSpillLimitCb_; const std::string fileCreateConfig_; @@ -214,14 +214,14 @@ class SpillWriter : public SpillWriterBase { SpillWriter( const RowTypePtr& type, const std::vector& sortingKeys, - common::CompressionKind compressionKind, + velox::common::CompressionKind compressionKind, const std::string& pathPrefix, uint64_t targetFileSize, uint64_t writeBufferSize, const std::string& fileCreateConfig, - common::UpdateAndCheckSpillLimitCB& updateAndCheckSpillLimitCb, + velox::common::UpdateAndCheckSpillLimitCB& updateAndCheckSpillLimitCb, memory::MemoryPool* pool, - folly::Synchronized* stats); + folly::Synchronized* stats); /// Adds 'rows' for the positions in 'indices' into 'this'. The indices /// must produce a view where the rows are sorted if sorting is desired. @@ -257,7 +257,7 @@ class SpillWriter : public SpillWriterBase { const std::vector sortingKeys_; - const common::CompressionKind compressionKind_; + const velox::common::CompressionKind compressionKind_; VectorSerde* const serde_; @@ -277,7 +277,7 @@ class SpillReadFile { const SpillFileInfo& fileInfo, uint64_t bufferSize, memory::MemoryPool* pool, - folly::Synchronized* stats); + folly::Synchronized* stats); uint32_t id() const { return id_; @@ -306,9 +306,9 @@ class SpillReadFile { uint64_t bufferSize, const RowTypePtr& type, const std::vector& sortingKeys, - common::CompressionKind compressionKind, + velox::common::CompressionKind compressionKind, memory::MemoryPool* pool, - folly::Synchronized* stats); + folly::Synchronized* stats); // Invoked to record spill read stats at the end of read input. void recordSpillStats(); @@ -322,11 +322,11 @@ class SpillReadFile { // The data type of spilled data. 
const RowTypePtr type_; const std::vector sortingKeys_; - const common::CompressionKind compressionKind_; + const velox::common::CompressionKind compressionKind_; const serializer::presto::PrestoVectorSerde::PrestoOptions readOptions_; memory::MemoryPool* const pool_; VectorSerde* const serde_; - folly::Synchronized* const stats_; + folly::Synchronized* const stats_; std::unique_ptr input_; }; diff --git a/velox/exec/Spiller.cpp b/velox/exec/Spiller.cpp index d8dcec644e80..b58e96b8a393 100644 --- a/velox/exec/Spiller.cpp +++ b/velox/exec/Spiller.cpp @@ -36,8 +36,8 @@ SpillerBase::SpillerBase( uint64_t targetFileSize, uint64_t maxSpillRunRows, std::optional parentId, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats) + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats) : container_(container), executor_(spillConfig->executor), bits_(bits), @@ -308,12 +308,12 @@ void SpillerBase::extractSpill( void SpillerBase::updateSpillExtractVectorTime(uint64_t timeNs) { spillStats_->wlock()->spillExtractVectorTimeNanos += timeNs; - common::updateGlobalSpillExtractVectorTime(timeNs); + velox::common::updateGlobalSpillExtractVectorTime(timeNs); } void SpillerBase::updateSpillSortTime(uint64_t timeNs) { spillStats_->wlock()->spillSortTimeNanos += timeNs; - common::updateGlobalSpillSortTime(timeNs); + velox::common::updateGlobalSpillSortTime(timeNs); } void SpillerBase::checkEmptySpillRuns() const { @@ -327,7 +327,7 @@ void SpillerBase::checkEmptySpillRuns() const { void SpillerBase::updateSpillFillTime(uint64_t timeNs) { spillStats_->wlock()->spillFillTimeNanos += timeNs; - common::updateGlobalSpillFillTime(timeNs); + velox::common::updateGlobalSpillFillTime(timeNs); } void SpillerBase::finishSpill(SpillPartitionSet& partitionSet) { @@ -387,8 +387,8 @@ NoRowContainerSpiller::NoRowContainerSpiller( std::optional parentId, HashBitRange bits, const std::vector& sortingKeys, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats) + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats) : SpillerBase( nullptr, std::move(rowType), @@ -404,8 +404,8 @@ NoRowContainerSpiller::NoRowContainerSpiller( RowTypePtr rowType, std::optional parentId, HashBitRange bits, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats) + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats) : NoRowContainerSpiller( std::move(rowType), parentId, @@ -434,8 +434,8 @@ void SortInputSpiller::spill() { SortOutputSpiller::SortOutputSpiller( RowContainer* container, RowTypePtr rowType, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats) + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats) : SpillerBase( container, std::move(rowType), diff --git a/velox/exec/Spiller.h b/velox/exec/Spiller.h index 5fa5a913fa7c..7dee467d483a 100644 --- a/velox/exec/Spiller.h +++ b/velox/exec/Spiller.h @@ -46,7 +46,7 @@ class SpillerBase { return finalized_; } - common::SpillStats stats() const; + velox::common::SpillStats stats() const; std::string toString() const; @@ -59,8 +59,8 @@ class SpillerBase { uint64_t targetFileSize, uint64_t maxSpillRunRows, std::optional parentId, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats); + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats); // Invoked to spill. 
If 'startRowIter' is not null, then we only spill rows // from row container starting at the offset pointed by 'startRowIter'. @@ -148,7 +148,7 @@ class SpillerBase { const std::optional parentId_; - folly::Synchronized* const spillStats_; + folly::Synchronized* const spillStats_; const std::vector compareFlags_; @@ -207,15 +207,15 @@ class NoRowContainerSpiller : public SpillerBase { std::optional parentId, HashBitRange bits, const std::vector& sortingKeys, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats); + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats); NoRowContainerSpiller( RowTypePtr rowType, std::optional parentId, HashBitRange bits, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats); + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats); void spill( const SpillPartitionId& partitionId, @@ -245,8 +245,8 @@ class SortInputSpiller : public SpillerBase { RowContainer* container, RowTypePtr rowType, const std::vector& sortingKeys, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats) + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats) : SpillerBase( container, std::move(rowType), @@ -277,8 +277,8 @@ class SortOutputSpiller : public SpillerBase { SortOutputSpiller( RowContainer* container, RowTypePtr rowType, - const common::SpillConfig* spillConfig, - folly::Synchronized* spillStats); + const velox::common::SpillConfig* spillConfig, + folly::Synchronized* spillStats); void spill(SpillRows& rows); diff --git a/velox/exec/Split.h b/velox/exec/Split.h index 1ccdcaeba161..248816a16b1f 100644 --- a/velox/exec/Split.h +++ b/velox/exec/Split.h @@ -15,12 +15,12 @@ */ #pragma once -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" namespace facebook::velox::exec { struct Split { - std::shared_ptr connectorSplit{nullptr}; + std::shared_ptr connectorSplit{nullptr}; int32_t groupId{-1}; // Bucketed group id (-1 means 'none'). /// Indicates if this is a barrier split. A barrier split is used by task @@ -31,7 +31,7 @@ struct Split { Split() = default; explicit Split( - std::shared_ptr&& connectorSplit, + std::shared_ptr&& connectorSplit, int32_t groupId = -1) : connectorSplit(std::move(connectorSplit)), groupId(groupId) {} diff --git a/velox/exec/TableScan.cpp b/velox/exec/TableScan.cpp index 601dd4381d31..20542b263e6f 100644 --- a/velox/exec/TableScan.cpp +++ b/velox/exec/TableScan.cpp @@ -46,7 +46,7 @@ TableScan::TableScan( driverCtx_->driverId, operatorType(), tableHandle_->connectorId())), - connector_(connector::getConnector(tableHandle_->connectorId())), + connector_(connector::common::getConnector(tableHandle_->connectorId())), getOutputTimeLimitMs_( driverCtx_->queryConfig().tableScanGetOutputTimeLimitMs()), scaledController_(driverCtx_->task->getScaledScanControllerLocked( @@ -128,7 +128,7 @@ RowVectorPtr TableScan::getOutput() { } const auto estimatedRowSize = dataSource_->estimatedRowSize(); readBatchSize_ = - estimatedRowSize == connector::DataSource::kUnknownRowSize + estimatedRowSize == connector::common::DataSource::kUnknownRowSize ? outputBatchRows() : outputBatchRows(estimatedRowSize); } @@ -355,13 +355,13 @@ void TableScan::tryScaleUp() { } void TableScan::preload( - const std::shared_ptr& split) { + const std::shared_ptr& split) { // The AsyncSource returns a unique_ptr to the shared_ptr of the // DataSource. 
The callback may outlive the Task, hence it captures // a shared_ptr to it. This is required to keep memory pools live // for the duration. The callback checks for task cancellation to // avoid needless work. - split->dataSource = std::make_unique>( + split->dataSource = std::make_unique>( [type = outputType_, table = tableHandle_, columns = columnHandles_, @@ -370,7 +370,7 @@ void TableScan::preload( split->connectorId, planNodeId(), connectorPool_), task = operatorCtx_->task(), dynamicFilters = dynamicFilters_, - split]() -> std::unique_ptr { + split]() -> std::unique_ptr { if (task->isCancelled()) { return nullptr; } @@ -407,7 +407,7 @@ void TableScan::checkPreload() { if (!splitPreloader_) { splitPreloader_ = [executor, - this](const std::shared_ptr& split) { + this](const std::shared_ptr& split) { preload(split); executor->add([connectorSplit = split]() mutable { @@ -426,7 +426,7 @@ bool TableScan::isFinished() { void TableScan::addDynamicFilter( const core::PlanNodeId& producer, column_index_t outputChannel, - const std::shared_ptr& filter) { + const std::shared_ptr& filter) { if (dataSource_) { dataSource_->addDynamicFilter(outputChannel, filter); } diff --git a/velox/exec/TableScan.h b/velox/exec/TableScan.h index 64c83c88f16f..418980b33caf 100644 --- a/velox/exec/TableScan.h +++ b/velox/exec/TableScan.h @@ -53,7 +53,7 @@ class TableScan : public SourceOperator { void addDynamicFilter( const core::PlanNodeId& producer, column_index_t outputChannel, - const std::shared_ptr& filter) override; + const std::shared_ptr& filter) override; /// The name of runtime stats specific to table scan. /// The number of running table scan drivers. @@ -88,7 +88,7 @@ class TableScan : public SourceOperator { // read 'split'. This source will be prepared in the background on the // executor of the connector. If the DataSource is needed before prepare is // done, it will be made when needed. - void preload(const std::shared_ptr& split); + void preload(const std::shared_ptr& split); // Invoked by scan operator to check if it needs to stop to wait for scale up. bool shouldWaitForScaleUp(); @@ -98,16 +98,16 @@ class TableScan : public SourceOperator { // processing or not. void tryScaleUp(); - const std::shared_ptr tableHandle_; + const std::shared_ptr tableHandle_; const std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> columnHandles_; DriverCtx* const driverCtx_; const int32_t maxSplitPreloadPerDriver_{0}; const vector_size_t maxReadBatchSize_; memory::MemoryPool* const connectorPool_; - const std::shared_ptr connector_; + const std::shared_ptr connector_; // Exits getOutput() method after this many milliseconds. Zero means 'no // limit'. const size_t getOutputTimeLimitMs_{0}; @@ -122,11 +122,11 @@ class TableScan : public SourceOperator { BlockingReason blockingReason_{BlockingReason::kNotBlocked}; int64_t currentSplitWeight_{0}; bool needNewSplit_ = true; - std::shared_ptr connectorQueryCtx_; - std::unique_ptr dataSource_; + std::shared_ptr connectorQueryCtx_; + std::unique_ptr dataSource_; bool noMoreSplits_ = false; // Dynamic filters to add to the data source when it gets created. - std::unordered_map> + std::unordered_map> dynamicFilters_; int32_t maxPreloadedSplits_{0}; @@ -135,7 +135,7 @@ class TableScan : public SourceOperator { // callback's lifetime is the lifetime of 'this'. This callback can schedule // preloads on an executor. These preloads may outlive the Task and therefore // need to capture a shared_ptr to it. 
- std::function&)> + std::function&)> splitPreloader_{nullptr}; // Count of splits that started background preload. diff --git a/velox/exec/TableWriter.cpp b/velox/exec/TableWriter.cpp index 6ce173412350..796b89993564 100644 --- a/velox/exec/TableWriter.cpp +++ b/velox/exec/TableWriter.cpp @@ -58,7 +58,7 @@ TableWriter::TableWriter( operatorId, driverCtx, tableWriteNode->aggregationNode()); } const auto& connectorId = tableWriteNode->insertTableHandle()->connectorId(); - connector_ = connector::getConnector(connectorId); + connector_ = connector::common::getConnector(connectorId); connectorQueryCtx_ = operatorCtx_->createConnectorQueryCtx( connectorId, planNodeId(), @@ -276,7 +276,7 @@ std::string TableWriter::createTableCommitContext(bool lastOutput) { // clang-format on } -void TableWriter::updateStats(const connector::DataSink::Stats& stats) { +void TableWriter::updateStats(const connector::common::DataSink::Stats& stats) { const auto currentTimeNs = getCurrentTimeNano(); VELOX_CHECK_GE(currentTimeNs, createTimeUs_); { @@ -341,7 +341,7 @@ void TableWriter::setConnectorMemoryReclaimer() { std::unique_ptr TableWriter::ConnectorReclaimer::create( - const std::optional& spillConfig, + const std::optional& spillConfig, DriverCtx* driverCtx, Operator* op) { return std::unique_ptr( diff --git a/velox/exec/TableWriter.h b/velox/exec/TableWriter.h index 70505cb3fb5e..43fdb3dc819e 100644 --- a/velox/exec/TableWriter.h +++ b/velox/exec/TableWriter.h @@ -160,7 +160,7 @@ class TableWriter : public Operator { class ConnectorReclaimer : public exec::ParallelMemoryReclaimer { public: static std::unique_ptr create( - const std::optional& spillConfig, + const std::optional& spillConfig, DriverCtx* driverCtx, Operator* op); @@ -187,7 +187,7 @@ class TableWriter : public Operator { private: ConnectorReclaimer( - const std::optional& spillConfig, + const std::optional& spillConfig, const std::shared_ptr& driver, Operator* op) : ParallelMemoryReclaimer( @@ -210,7 +210,7 @@ class TableWriter : public Operator { void abortDataSink(); - void updateStats(const connector::DataSink::Stats& stats); + void updateStats(const connector::common::DataSink::Stats& stats); // Sets type mappings in `inputMapping_`, `mappedInputType_`, and // `mappedOutputType_`. @@ -223,18 +223,18 @@ class TableWriter : public Operator { const DriverCtx* const driverCtx_; memory::MemoryPool* const connectorPool_; - const std::shared_ptr + const std::shared_ptr insertTableHandle_; - const connector::CommitStrategy commitStrategy_; + const connector::common::CommitStrategy commitStrategy_; // Records the writer operator creation time in ns. This is used to record // the running wall time of a writer operator. This can helps to detect the // slow scaled writer scheduling in Prestissimo. const uint64_t createTimeUs_{0}; std::unique_ptr aggregation_; - std::shared_ptr connector_; - std::shared_ptr connectorQueryCtx_; - std::unique_ptr dataSink_; + std::shared_ptr connector_; + std::shared_ptr connectorQueryCtx_; + std::unique_ptr dataSink_; // Contains the mappings between input and output columns. std::vector inputMapping_; diff --git a/velox/exec/Task.cpp b/velox/exec/Task.cpp index bfa2149a7845..4ad27eab0b91 100644 --- a/velox/exec/Task.cpp +++ b/velox/exec/Task.cpp @@ -13,10 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include "velox/exec/Task.h" + +#include + #include #include #include -#include #include "velox/common/base/Counters.h" #include "velox/common/base/StatsReporter.h" @@ -31,7 +34,6 @@ #include "velox/exec/OperatorUtils.h" #include "velox/exec/OutputBufferManager.h" #include "velox/exec/PlanNodeStats.h" -#include "velox/exec/Task.h" #include "velox/exec/TraceUtil.h" using facebook::velox::common::testutil::TestValue; @@ -579,8 +581,9 @@ velox::memory::MemoryPool* Task::addOperatorPool( } else { nodePool = getOrAddNodePool(planNodeId); } - childPools_.push_back(nodePool->addLeafChild(fmt::format( - "op.{}.{}.{}.{}", planNodeId, pipelineId, driverId, operatorType))); + childPools_.push_back(nodePool->addLeafChild( + fmt::format( + "op.{}.{}.{}.{}", planNodeId, pipelineId, driverId, operatorType))); return childPools_.back().get(); } @@ -591,13 +594,14 @@ velox::memory::MemoryPool* Task::addConnectorPoolLocked( const std::string& operatorType, const std::string& connectorId) { auto* nodePool = getOrAddNodePool(planNodeId); - childPools_.push_back(nodePool->addAggregateChild(fmt::format( - "op.{}.{}.{}.{}.{}", - planNodeId, - pipelineId, - driverId, - operatorType, - connectorId))); + childPools_.push_back(nodePool->addAggregateChild( + fmt::format( + "op.{}.{}.{}.{}.{}", + planNodeId, + pipelineId, + driverId, + operatorType, + connectorId))); return childPools_.back().get(); } @@ -2221,7 +2225,7 @@ template std::shared_ptr Task::getJoinBridgeInternalLocked( uint32_t splitGroupId, const core::PlanNodeId& planNodeId, - MemberType SplitGroupState::*bridges_member) { + MemberType SplitGroupState::* bridges_member) { const auto& splitGroupState = splitGroupStates_[splitGroupId]; auto it = (splitGroupState.*bridges_member).find(planNodeId); @@ -2881,8 +2885,9 @@ void Task::createLocalExchangeQueuesLocked( queryCtx_->queryConfig().maxLocalExchangeBufferSize()); exchange.queues.reserve(numPartitions); for (auto i = 0; i < numPartitions; ++i) { - exchange.queues.emplace_back(std::make_shared( - exchange.memoryManager, exchange.vectorPool, i)); + exchange.queues.emplace_back( + std::make_shared( + exchange.memoryManager, exchange.vectorPool, i)); } const auto partitionNode = @@ -2890,7 +2895,7 @@ void Task::createLocalExchangeQueuesLocked( VELOX_CHECK_NOT_NULL(partitionNode); if (partitionNode->scaleWriter()) { exchange.scaleWriterPartitionBalancer = - std::make_shared( + std::make_shared( queryCtx_->queryConfig().scaleWriterMaxPartitionsPerWriter() * numPartitions, numPartitions, @@ -2942,7 +2947,7 @@ Task::getLocalExchangeQueues( return it->second.queues; } -const std::shared_ptr& +const std::shared_ptr& Task::getScaleWriterPartitionBalancer( uint32_t splitGroupId, const core::PlanNodeId& planNodeId) { @@ -3574,8 +3579,8 @@ bool Task::DriverBlockingState::blocked(ContinueFuture* future) { VELOX_CHECK(promises_.empty()); return false; } - auto [blockPromise, blockFuture] = - makeVeloxContinuePromiseContract(fmt::format( + auto [blockPromise, blockFuture] = makeVeloxContinuePromiseContract( + fmt::format( "DriverBlockingState {} from task {}", driver_->driverCtx()->driverId, driver_->task()->taskId())); diff --git a/velox/exec/Task.h b/velox/exec/Task.h index e7ef81617a4b..a0b775a369a9 100644 --- a/velox/exec/Task.h +++ b/velox/exec/Task.h @@ -37,8 +37,8 @@ class OutputBufferManager; class HashJoinBridge; class NestedLoopJoinBridge; -using ConnectorSplitPreloadFunc = - std::function&)>; +using ConnectorSplitPreloadFunc = std::function&)>; class Task : public std::enable_shared_from_this { public: @@ 
-504,7 +504,7 @@ class Task : public std::enable_shared_from_this { /// Returns the shared skewed partition balancer for scale writer local /// partitioning with the given split group id and plan node id. - const std::shared_ptr& + const std::shared_ptr& getScaleWriterPartitionBalancer( uint32_t splitGroupId, const core::PlanNodeId& planNodeId); @@ -954,7 +954,7 @@ class Task : public std::enable_shared_from_this { std::shared_ptr getJoinBridgeInternalLocked( uint32_t splitGroupId, const core::PlanNodeId& planNodeId, - MemberType SplitGroupState::*bridges_member); + MemberType SplitGroupState::* bridges_member); std::shared_ptr getCustomJoinBridgeInternal( uint32_t splitGroupId, @@ -1382,7 +1382,7 @@ class Task : public std::enable_shared_from_this { std::atomic spillDirectoryCreated_{false}; // Stores unconsumed preloading splits to ensure they are closed promptly. - folly::F14FastSet> + folly::F14FastSet> preloadingSplits_; folly::CancellationSource cancellationSource_; diff --git a/velox/exec/TaskStructs.h b/velox/exec/TaskStructs.h index bf3418a3f25b..8e8180602128 100644 --- a/velox/exec/TaskStructs.h +++ b/velox/exec/TaskStructs.h @@ -90,7 +90,7 @@ struct LocalExchangeState { std::shared_ptr memoryManager; std::shared_ptr vectorPool; std::vector> queues; - std::shared_ptr + std::shared_ptr scaleWriterPartitionBalancer; }; diff --git a/velox/exec/VectorHasher.cpp b/velox/exec/VectorHasher.cpp index dc4c921fb035..73ab9d8c99f8 100644 --- a/velox/exec/VectorHasher.cpp +++ b/velox/exec/VectorHasher.cpp @@ -664,7 +664,7 @@ void VectorHasher::setRangeOverflow() { hasRange_ = false; } -std::unique_ptr VectorHasher::getFilter( +std::unique_ptr VectorHasher::getFilter( bool nullAllowed) const { switch (typeKind_) { case TypeKind::TINYINT: @@ -681,7 +681,7 @@ std::unique_ptr VectorHasher::getFilter( values.emplace_back(value.data()); } - return common::createBigintValues(values, nullAllowed); + return velox::common::createBigintValues(values, nullAllowed); } [[fallthrough]]; default: diff --git a/velox/exec/VectorHasher.h b/velox/exec/VectorHasher.h index 6a18df611735..01e534af46bd 100644 --- a/velox/exec/VectorHasher.h +++ b/velox/exec/VectorHasher.h @@ -242,7 +242,7 @@ class VectorHasher { // Returns an instance of the filter corresponding to a set of unique values. // Returns null if distinctOverflow_ is true. 
- std::unique_ptr getFilter(bool nullAllowed) const; + std::unique_ptr getFilter(bool nullAllowed) const; void resetStats() { uniqueValues_.clear(); diff --git a/velox/exec/Window.cpp b/velox/exec/Window.cpp index 1f1a91331a95..b8853d010712 100644 --- a/velox/exec/Window.cpp +++ b/velox/exec/Window.cpp @@ -58,7 +58,7 @@ Window::Window( windowBuild_ = std::make_unique( windowNode, pool(), - common::PrefixSortConfig{ + velox::common::PrefixSortConfig{ driverCtx->queryConfig().prefixSortNormalizedKeyMaxBytes(), driverCtx->queryConfig().prefixSortMinRows(), driverCtx->queryConfig().prefixSortMaxStringPrefixLength()}, diff --git a/velox/exec/WindowBuild.cpp b/velox/exec/WindowBuild.cpp index 2c5096be472c..584ee1bd6a97 100644 --- a/velox/exec/WindowBuild.cpp +++ b/velox/exec/WindowBuild.cpp @@ -86,7 +86,7 @@ slice(const std::vector& types, int32_t start, int32_t end) { WindowBuild::WindowBuild( const std::shared_ptr& windowNode, velox::memory::MemoryPool* pool, - const common::SpillConfig* spillConfig, + const velox::common::SpillConfig* spillConfig, tsan_atomic* nonReclaimableSection) : spillConfig_{spillConfig}, nonReclaimableSection_{nonReclaimableSection}, diff --git a/velox/exec/WindowBuild.h b/velox/exec/WindowBuild.h index 01c470803ed7..8a7087471747 100644 --- a/velox/exec/WindowBuild.h +++ b/velox/exec/WindowBuild.h @@ -31,7 +31,7 @@ class WindowBuild { WindowBuild( const std::shared_ptr& windowNode, velox::memory::MemoryPool* pool, - const common::SpillConfig* spillConfig, + const velox::common::SpillConfig* spillConfig, tsan_atomic* nonReclaimableSection); virtual ~WindowBuild() = default; @@ -48,7 +48,7 @@ class WindowBuild { virtual void spill() = 0; /// Returns the spiller stats including total bytes and rows spilled so far. - virtual std::optional spilledStats() const = 0; + virtual std::optional spilledStats() const = 0; /// The Window operator invokes this function to indicate that no more input /// rows will be passed from the Window operator to the WindowBuild. When @@ -84,7 +84,7 @@ class WindowBuild { const char* rhs, const std::vector>& keys); - const common::SpillConfig* const spillConfig_; + const velox::common::SpillConfig* const spillConfig_; tsan_atomic* const nonReclaimableSection_; /// The below 2 vectors represent the ChannelIndex of the partition keys diff --git a/velox/exec/benchmarks/PrefixSortBenchmark.cpp b/velox/exec/benchmarks/PrefixSortBenchmark.cpp index 28fc38325ac6..b9deac038e55 100644 --- a/velox/exec/benchmarks/PrefixSortBenchmark.cpp +++ b/velox/exec/benchmarks/PrefixSortBenchmark.cpp @@ -102,13 +102,13 @@ class TestCase { // You could config threshold, e.i. 0, to test prefix-sort for small // dateset. -static const common::PrefixSortConfig kDefaultSortConfig(1024, 100, 50); +static const velox::common::PrefixSortConfig kDefaultSortConfig(1024, 100, 50); // For small dataset, in some test environments, if std-sort is defined in the // benchmark file, the test results may be strangely regressed. When the // threshold is particularly large, PrefixSort is actually std-sort, hence, we // can use this as std-sort benchmark base. 
-static const common::PrefixSortConfig +static const velox::common::PrefixSortConfig kStdSortConfig(1024, std::numeric_limits::max(), 50); class PrefixSortBenchmark { diff --git a/velox/exec/fuzzer/AggregationFuzzer.cpp b/velox/exec/fuzzer/AggregationFuzzer.cpp index fd98a8d8a83d..1e29250afe11 100644 --- a/velox/exec/fuzzer/AggregationFuzzer.cpp +++ b/velox/exec/fuzzer/AggregationFuzzer.cpp @@ -266,7 +266,7 @@ AggregationFuzzer::AggregationFuzzer( void AggregationFuzzer::go(const std::string& planPath) { Type::registerSerDe(); connector::hive::HiveTableHandle::registerSerDe(); - connector::hive::LocationHandle::registerSerDe(); + connector::hive::HiveLocationHandle::registerSerDe(); connector::hive::HiveColumnHandle::registerSerDe(); connector::hive::HiveInsertTableHandle::registerSerDe(); core::ITypedExpr::registerSerDe(); diff --git a/velox/exec/fuzzer/AggregationFuzzerBase.cpp b/velox/exec/fuzzer/AggregationFuzzerBase.cpp index 2bd16248fbdd..08ac353c45c9 100644 --- a/velox/exec/fuzzer/AggregationFuzzerBase.cpp +++ b/velox/exec/fuzzer/AggregationFuzzerBase.cpp @@ -761,7 +761,7 @@ void persistReproInfo( // Create a new directory const auto dirPathOptional = - common::generateTempFolderPath(basePath.c_str(), "aggregationVerifier"); + velox::common::generateTempFolderPath(basePath.c_str(), "aggregationVerifier"); if (!dirPathOptional.has_value()) { LOG(ERROR) << "Failed to create directory for persisting plans using base path: " diff --git a/velox/exec/fuzzer/AggregationFuzzerBase.h b/velox/exec/fuzzer/AggregationFuzzerBase.h index 668fc9f6ecb6..cd130ae1d636 100644 --- a/velox/exec/fuzzer/AggregationFuzzerBase.h +++ b/velox/exec/fuzzer/AggregationFuzzerBase.h @@ -81,7 +81,7 @@ class AggregationFuzzerBase { : getFuzzerOptions(timestampPrecision), pool_.get()} { filesystems::registerLocalFileSystem(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); registerHiveConnector(hiveConfigs); dwrf::registerDwrfReaderFactory(); diff --git a/velox/exec/fuzzer/FuzzerUtil.cpp b/velox/exec/fuzzer/FuzzerUtil.cpp index 299536905e60..7cdfde21d882 100644 --- a/velox/exec/fuzzer/FuzzerUtil.cpp +++ b/velox/exec/fuzzer/FuzzerUtil.cpp @@ -126,7 +126,7 @@ Split makeSplit( return Split{makeConnectorSplit(filePath, partitionKeys, tableBucketNumber)}; } -std::shared_ptr makeConnectorSplit( +std::shared_ptr makeConnectorSplit( const std::string& filePath, const std::unordered_map>& partitionKeys, @@ -368,16 +368,16 @@ void registerHiveConnector( auto configs = hiveConfigs; if (!connector::hasConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName)) { - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); } auto hiveConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( kHiveConnectorId, std::make_shared(std::move(configs))); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); } std::unique_ptr setupReferenceQueryRunner( diff --git a/velox/exec/fuzzer/FuzzerUtil.h b/velox/exec/fuzzer/FuzzerUtil.h index dd8f7b37936a..ed5bcced6598 100644 --- a/velox/exec/fuzzer/FuzzerUtil.h +++ b/velox/exec/fuzzer/FuzzerUtil.h @@ -61,7 +61,7 @@ Split makeSplit( std::optional tableBucketNumber = std::nullopt); /// Create a connector split from an exsiting file. 
-std::shared_ptr makeConnectorSplit( +std::shared_ptr makeConnectorSplit( const std::string& filePath, const std::unordered_map>& partitionKeys = {}, diff --git a/velox/exec/fuzzer/JoinFuzzer.cpp b/velox/exec/fuzzer/JoinFuzzer.cpp index ded82a03a310..aae5f99cd0bb 100644 --- a/velox/exec/fuzzer/JoinFuzzer.cpp +++ b/velox/exec/fuzzer/JoinFuzzer.cpp @@ -205,15 +205,15 @@ JoinFuzzer::JoinFuzzer( // Make sure not to run out of open file descriptors. std::unordered_map hiveConfig = { {connector::hive::HiveConfig::kNumCacheFileHandles, "1000"}}; - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); auto hiveConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( test::kHiveConnectorId, std::make_shared(std::move(hiveConfig))); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); dwrf::registerDwrfReaderFactory(); dwrf::registerDwrfWriterFactory(); diff --git a/velox/exec/fuzzer/MemoryArbitrationFuzzer.cpp b/velox/exec/fuzzer/MemoryArbitrationFuzzer.cpp index 4aaeee65b887..30c2cf92d481 100644 --- a/velox/exec/fuzzer/MemoryArbitrationFuzzer.cpp +++ b/velox/exec/fuzzer/MemoryArbitrationFuzzer.cpp @@ -268,15 +268,15 @@ MemoryArbitrationFuzzer::MemoryArbitrationFuzzer(size_t initialSeed) // Make sure not to run out of open file descriptors. std::unordered_map hiveConfig = { {connector::hive::HiveConfig::kNumCacheFileHandles, "1000"}}; - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); const auto hiveConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( kHiveConnectorId, std::make_shared(std::move(hiveConfig))); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); dwrf::registerDwrfReaderFactory(); dwrf::registerDwrfWriterFactory(); diff --git a/velox/exec/fuzzer/WriterFuzzer.cpp b/velox/exec/fuzzer/WriterFuzzer.cpp index 92e31590d5e0..d89ea8bc528c 100644 --- a/velox/exec/fuzzer/WriterFuzzer.cpp +++ b/velox/exec/fuzzer/WriterFuzzer.cpp @@ -151,7 +151,7 @@ class WriterFuzzer { // Generates table column handles based on table column properties std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> getTableColumnHandles( const std::vector& names, const std::vector& types, @@ -639,7 +639,7 @@ void WriterFuzzer::verifyWriter( std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> WriterFuzzer::getTableColumnHandles( const std::vector& names, const std::vector& types, @@ -647,7 +647,7 @@ WriterFuzzer::getTableColumnHandles( const int32_t bucketCount) { std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> columnHandle; for (int i = 0; i < names.size(); ++i) { HiveColumnHandle::ColumnType columnType; diff --git a/velox/exec/fuzzer/WriterFuzzerRunner.h b/velox/exec/fuzzer/WriterFuzzerRunner.h index 527b11bbef1f..88531286e3d0 100644 --- a/velox/exec/fuzzer/WriterFuzzerRunner.h +++ b/velox/exec/fuzzer/WriterFuzzerRunner.h @@ -77,16 +77,16 @@ class WriterFuzzerRunner { std::unique_ptr referenceQueryRunner) { filesystems::registerLocalFileSystem(); tests::utils::registerFaultyFileSystem(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); auto hiveConnector = - connector::getConnectorFactory( + 
connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( kHiveConnectorId, std::make_shared( std::unordered_map())); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); dwrf::registerDwrfReaderFactory(); dwrf::registerDwrfWriterFactory(); dwio::common::registerFileSinks(); diff --git a/velox/exec/tests/AddressableNonNullValueListTest.cpp b/velox/exec/tests/AddressableNonNullValueListTest.cpp index 9f150f293253..8e10838a4b65 100644 --- a/velox/exec/tests/AddressableNonNullValueListTest.cpp +++ b/velox/exec/tests/AddressableNonNullValueListTest.cpp @@ -129,7 +129,7 @@ class AddressableNonNullValueListTest : public testing::Test, // Deserialize entries from the stream. AddressableNonNullValueList deserialized; std::vector deserializedEntries; - common::InputByteStream stream(rawBuffer); + velox::common::InputByteStream stream(rawBuffer); while (stream.offset() < totalSize) { auto length = stream.read(); auto hash = stream.read(); diff --git a/velox/exec/tests/AggregateSpillBenchmarkBase.cpp b/velox/exec/tests/AggregateSpillBenchmarkBase.cpp index b8f15c3c799b..2d9b4894fe56 100644 --- a/velox/exec/tests/AggregateSpillBenchmarkBase.cpp +++ b/velox/exec/tests/AggregateSpillBenchmarkBase.cpp @@ -130,7 +130,7 @@ void AggregateSpillBenchmarkBase::writeSpillData() { } std::unique_ptr AggregateSpillBenchmarkBase::makeSpiller() { - common::SpillConfig spillConfig; + velox::common::SpillConfig spillConfig; spillConfig.getSpillDirPathCb = [&]() -> std::string_view { return spillDir_; }; diff --git a/velox/exec/tests/AggregationTest.cpp b/velox/exec/tests/AggregationTest.cpp index b1c9a36d6855..a1c355765e1e 100644 --- a/velox/exec/tests/AggregationTest.cpp +++ b/velox/exec/tests/AggregationTest.cpp @@ -42,7 +42,7 @@ namespace facebook::velox::exec::test { using core::QueryConfig; using facebook::velox::test::BatchMaker; -using namespace common::testutil; +using namespace velox::common::testutil; /// No-op implementation of Aggregate. Provides public access to following /// base class methods: setNull, clearNull and isNull. diff --git a/velox/exec/tests/AssertQueryBuilderTest.cpp b/velox/exec/tests/AssertQueryBuilderTest.cpp index 0a7d6e5df4cb..a058fb4ee685 100644 --- a/velox/exec/tests/AssertQueryBuilderTest.cpp +++ b/velox/exec/tests/AssertQueryBuilderTest.cpp @@ -93,7 +93,7 @@ TEST_F(AssertQueryBuilderTest, hiveSplits) { .assertResults("VALUES (1), (2), (3)"); // Split with partition key. 
- ConnectorColumnHandleMap assignments = { + connector::common::ConnectorColumnHandleMap assignments = { {"ds", partitionKey("ds", VARCHAR())}, {"c0", regularColumn("c0", BIGINT())}}; diff --git a/velox/exec/tests/AsyncConnectorTest.cpp b/velox/exec/tests/AsyncConnectorTest.cpp index 4eaf4021b399..56ec19ccb438 100644 --- a/velox/exec/tests/AsyncConnectorTest.cpp +++ b/velox/exec/tests/AsyncConnectorTest.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ #include -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" #include "velox/exec/PlanNodeStats.h" #include "velox/exec/tests/utils/OperatorTestBase.h" #include "velox/exec/tests/utils/PlanBuilder.h" @@ -30,19 +30,19 @@ namespace { const std::string kTestConnectorId = "test"; -class TestTableHandle : public connector::ConnectorTableHandle { +class TestTableHandle : public connector::common::ConnectorTableHandle { public: - TestTableHandle() : connector::ConnectorTableHandle(kTestConnectorId) {} + TestTableHandle() : connector::common::ConnectorTableHandle(kTestConnectorId) {} std::string toString() const override { VELOX_NYI(); } }; -class TestSplit : public connector::ConnectorSplit { +class TestSplit : public connector::common::ConnectorSplit { public: explicit TestSplit(uint32_t delayMs) - : connector::ConnectorSplit(kTestConnectorId), delayMs_{delayMs} { + : connector::common::ConnectorSplit(kTestConnectorId), delayMs_{delayMs} { scheduler_.start(); } @@ -70,11 +70,11 @@ class TestSplit : public connector::ConnectorSplit { velox::ContinuePromise promise_; }; -class TestDataSource : public connector::DataSource { +class TestDataSource : public connector::common::DataSource { public: explicit TestDataSource(memory::MemoryPool* pool) : pool_{pool} {} - void addSplit(std::shared_ptr split) override { + void addSplit(std::shared_ptr split) override { auto testSplit = std::dynamic_pointer_cast(split); VELOX_CHECK_NOT_NULL(testSplit); future_ = testSplit->touch(); @@ -109,7 +109,7 @@ class TestDataSource : public connector::DataSource { void addDynamicFilter( column_index_t /* outputChannel */, - const std::shared_ptr& /* filter */) override { + const std::shared_ptr& /* filter */) override { VELOX_NYI(); } @@ -131,38 +131,38 @@ class TestDataSource : public connector::DataSource { ContinueFuture future_{ContinueFuture::makeEmpty()}; }; -class TestConnector : public connector::Connector { +class TestConnector : public connector::common::Connector { public: - TestConnector(const std::string& id) : connector::Connector(id) {} + TestConnector(const std::string& id) : connector::common::Connector(id) {} - std::unique_ptr createDataSource( + std::unique_ptr createDataSource( const RowTypePtr& /* outputType */, - const std::shared_ptr& /* tableHandle */, + const std::shared_ptr& /* tableHandle */, const
std::unordered_map< std::string, std::shared_ptr< - connector::ConnectorColumnHandle>>& /* columnHandles */, - connector::ConnectorQueryCtx* connectorQueryCtx) override { + connector::common::ConnectorColumnHandle>>& /* columnHandles */, + connector::common::ConnectorQueryCtx* connectorQueryCtx) override { return std::make_unique(connectorQueryCtx->memoryPool()); } - std::unique_ptr createDataSink( + std::unique_ptr createDataSink( RowTypePtr /*inputType*/, std::shared_ptr< - ConnectorInsertTableHandle> /*connectorInsertTableHandle*/, - ConnectorQueryCtx* /*connectorQueryCtx*/, - CommitStrategy /*commitStrategy*/) override final { + connector::common::ConnectorInsertTableHandle> /*connectorInsertTableHandle*/, + connector::common::ConnectorQueryCtx* /*connectorQueryCtx*/, + connector::common::CommitStrategy /*commitStrategy*/) override final { VELOX_NYI(); } }; -class TestConnectorFactory : public connector::ConnectorFactory { +class TestConnectorFactory : public connector::common::ConnectorFactory { public: static constexpr const char* kTestConnectorName = "test"; - TestConnectorFactory() : connector::ConnectorFactory(kTestConnectorName) {} + TestConnectorFactory() : connector::common::ConnectorFactory(kTestConnectorName) {} - std::shared_ptr newConnector( + std::shared_ptr newConnector( const std::string& id, std::shared_ptr config, folly::Executor* /* ioExecutor */, @@ -176,20 +176,20 @@ class AsyncConnectorTest : public OperatorTestBase { public: void SetUp() override { OperatorTestBase::SetUp(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); auto testConnector = - connector::getConnectorFactory(TestConnectorFactory::kTestConnectorName) + connector::common::getConnectorFactory(TestConnectorFactory::kTestConnectorName) ->newConnector( kTestConnectorId, std::make_shared( std::unordered_map()), nullptr); - connector::registerConnector(testConnector); + connector::common::registerConnector(testConnector); } void TearDown() override { - connector::unregisterConnector(kTestConnectorId); + connector::common::unregisterConnector(kTestConnectorId); OperatorTestBase::TearDown(); } }; diff --git a/velox/exec/tests/ConcatFilesSpillMergeStreamTest.cpp b/velox/exec/tests/ConcatFilesSpillMergeStreamTest.cpp index 57a91976fba6..44ac3dd08d6e 100644 --- a/velox/exec/tests/ConcatFilesSpillMergeStreamTest.cpp +++ b/velox/exec/tests/ConcatFilesSpillMergeStreamTest.cpp @@ -50,7 +50,7 @@ class ConcatFilesSpillMergeStreamTest : public OperatorTestBase { sortCompareFlags_, pool_.get(), &nonReclaimableSection_, - common::PrefixSortConfig{}, + velox::common::PrefixSortConfig{}, nullptr, nullptr); for (const auto& vector : vectors) { @@ -173,7 +173,7 @@ class ConcatFilesSpillMergeStreamTest : public OperatorTestBase { sortCompareFlags_, pool_.get(), &nonReclaimableSection_, - common::PrefixSortConfig{}, + velox::common::PrefixSortConfig{}, nullptr, nullptr); for (const auto& vector : vectors) { @@ -205,7 +205,7 @@ class ConcatFilesSpillMergeStreamTest : public OperatorTestBase { SpillState::makeSortingKeys(sortColumnIndices_, sortCompareFlags_); const std::shared_ptr spillDirectory_ = exec::test::TempDirectoryPath::create(); - const common::SpillConfig spillConfig_{ + const velox::common::SpillConfig spillConfig_{ [&]() -> const std::string& { return spillDirectory_->getPath(); }, [&](uint64_t) {}, "0.0.0", @@ -223,7 +223,7 @@ class ConcatFilesSpillMergeStreamTest : public OperatorTestBase { "none", std::nullopt}; - folly::Synchronized spillStats_; + 
folly::Synchronized spillStats_; tsan_atomic nonReclaimableSection_{false}; }; } // namespace facebook::velox::exec::test diff --git a/velox/exec/tests/ContainerRowSerdeTest.cpp b/velox/exec/tests/ContainerRowSerdeTest.cpp index b13f31382da7..9c4c05012670 100644 --- a/velox/exec/tests/ContainerRowSerdeTest.cpp +++ b/velox/exec/tests/ContainerRowSerdeTest.cpp @@ -326,7 +326,7 @@ TEST_F(ContainerRowSerdeTest, nested) { auto nestedArray = makeNullableNestedArrayVector( {{{{{"1", "2"}}, {{"3", "4"}}}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{std::nullopt, {}}}}); testRoundTrip(nestedArray); diff --git a/velox/exec/tests/ExchangeClientTest.cpp b/velox/exec/tests/ExchangeClientTest.cpp index 86883d2519de..b47327a0d79f 100644 --- a/velox/exec/tests/ExchangeClientTest.cpp +++ b/velox/exec/tests/ExchangeClientTest.cpp @@ -64,7 +64,7 @@ class ExchangeClientTest } bufferManager_ = OutputBufferManager::getInstanceRef(); - common::testutil::TestValue::enable(); + velox::common::testutil::TestValue::enable(); } void TearDown() override { @@ -506,7 +506,7 @@ TEST_P(ExchangeClientTest, sourceTimeout) { TEST_P(ExchangeClientTest, callNextAfterClose) { constexpr int32_t kNumSources = 3; - common::testutil::TestValue::enable(); + velox::common::testutil::TestValue::enable(); auto client = std::make_shared( "test", 17, diff --git a/velox/exec/tests/HashJoinBridgeTest.cpp b/velox/exec/tests/HashJoinBridgeTest.cpp index d5a6e5b24072..6ed3249cd7eb 100644 --- a/velox/exec/tests/HashJoinBridgeTest.cpp +++ b/velox/exec/tests/HashJoinBridgeTest.cpp @@ -121,7 +121,7 @@ class HashJoinBridgeTest : public testing::Test, tempDir_->getPath() + "/Spill_" + std::to_string(fileId), 1024, SpillState::makeSortingKeys(std::vector(1)), - common::CompressionKind_NONE}); + velox::common::CompressionKind_NONE}); } return files; } diff --git a/velox/exec/tests/HashJoinTest.cpp b/velox/exec/tests/HashJoinTest.cpp index 418dc3c2aa81..e7b8d8bcbb78 100644 --- a/velox/exec/tests/HashJoinTest.cpp +++ b/velox/exec/tests/HashJoinTest.cpp @@ -3721,7 +3721,7 @@ TEST_F(HashJoinTest, dynamicFilters) { // having different names than column names in the files. 
{ auto scanOutputType = ROW({"a", "b"}, {INTEGER(), BIGINT()}); - ConnectorColumnHandleMap assignments; + connector::common::ConnectorColumnHandleMap assignments; assignments["a"] = regularColumn("c0", INTEGER()); assignments["b"] = regularColumn("c1", BIGINT()); @@ -4500,7 +4500,7 @@ TEST_F(HashJoinTest, dynamicFiltersAppliedToPreloadedSplits) { } auto outputType = ROW({"p0", "p1"}, {BIGINT(), BIGINT()}); - ConnectorColumnHandleMap assignments = { + connector::common::ConnectorColumnHandleMap assignments = { {"p0", regularColumn("p0", BIGINT())}, {"p1", partitionKey("p1", BIGINT())}}; createDuckDbTable("p", probeVectors); @@ -4894,7 +4894,7 @@ TEST_F(HashJoinTest, dynamicFilterOnPartitionKey) { .partitionKey("k", "0") .build(); auto outputType = ROW({"n1_0", "n1_1"}, {BIGINT(), BIGINT()}); - ConnectorColumnHandleMap assignments = { + connector::common::ConnectorColumnHandleMap assignments = { {"n1_0", regularColumn("c0", BIGINT())}, {"n1_1", partitionKey("k", BIGINT())}}; @@ -6267,7 +6267,7 @@ DEBUG_ONLY_TEST_F(HashJoinTest, exceededMaxSpillLevel) { auto tempDirectory = exec::test::TempDirectoryPath::create(); const int exceededMaxSpillLevelCount = - common::globalSpillStats().spillMaxLevelExceededCount; + velox::common::globalSpillStats().spillMaxLevelExceededCount; SCOPED_TESTVALUE_SET( "facebook::velox::exec::HashBuild::reclaim", std::function(([&](exec::Operator* op) { @@ -6321,7 +6321,7 @@ DEBUG_ONLY_TEST_F(HashJoinTest, exceededMaxSpillLevel) { }) .run(); ASSERT_EQ( - common::globalSpillStats().spillMaxLevelExceededCount, + velox::common::globalSpillStats().spillMaxLevelExceededCount, exceededMaxSpillLevelCount + 16); } diff --git a/velox/exec/tests/HashTableTest.cpp b/velox/exec/tests/HashTableTest.cpp index bee323d38f8f..5613de1ec76e 100644 --- a/velox/exec/tests/HashTableTest.cpp +++ b/velox/exec/tests/HashTableTest.cpp @@ -95,7 +95,7 @@ class HashTableTest : public testing::TestWithParam, } void SetUp() override { - common::testutil::TestValue::enable(); + velox::common::testutil::TestValue::enable(); if (GetParam()) { executor_ = std::make_unique(16); } diff --git a/velox/exec/tests/IndexLookupJoinTest.cpp b/velox/exec/tests/IndexLookupJoinTest.cpp index f411ad29df56..81e0cf0e40af 100644 --- a/velox/exec/tests/IndexLookupJoinTest.cpp +++ b/velox/exec/tests/IndexLookupJoinTest.cpp @@ -15,12 +15,12 @@ */ #include "velox/exec/IndexLookupJoin.h" #include "folly/experimental/EventCount.h" #include "gmock/gmock.h" #include "gtest/gtest-matchers.h" #include "velox/common/base/tests/GTestUtils.h" #include "velox/common/testutil/TestValue.h" -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" #include "velox/core/PlanNode.h" #include "velox/exec/PlanNodeStats.h" #include "velox/exec/tests/utils/AssertQueryBuilder.h" @@ -80,16 +80,16 @@ class IndexLookupJoinTest : public IndexLookupJoinTestBase, connector::hive::HiveColumnHandle::registerSerDe(); Type::registerSerDe(); core::ITypedExpr::registerSerDe(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); - std::shared_ptr connector = - connector::getConnectorFactory(kTestIndexConnectorName) + std::shared_ptr connector = + connector::common::getConnectorFactory(kTestIndexConnectorName) ->newConnector( kTestIndexConnectorName, {}, nullptr, connectorCpuExecutor_.get()); - connector::registerConnector(connector); + connector::common::registerConnector(connector); keyType_ = ROW({"u0", "u1", "u2"}, {BIGINT(), BIGINT(), BIGINT()}); valueType_ =
ROW({"u3", "u4", "u5"}, {BIGINT(), BIGINT(), VARCHAR()}); @@ -102,8 +102,8 @@ class IndexLookupJoinTest : public IndexLookupJoinTestBase, } void TearDown() override { - connector::unregisterConnectorFactory(kTestIndexConnectorName); - connector::unregisterConnector(kTestIndexConnectorName); + connector::common::unregisterConnectorFactory(kTestIndexConnectorName); + connector::common::unregisterConnector(kTestIndexConnectorName); HiveConnectorTestBase::TearDown(); } @@ -803,7 +803,7 @@ TEST_P(IndexLookupJoinTest, equalJoin) { auto planNodeIdGenerator = std::make_shared(); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -1260,7 +1260,7 @@ TEST_P(IndexLookupJoinTest, betweenJoinCondition) { auto planNodeIdGenerator = std::make_shared(); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -1583,7 +1583,7 @@ TEST_P(IndexLookupJoinTest, inJoinCondition) { auto planNodeIdGenerator = std::make_shared(); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -1636,7 +1636,7 @@ DEBUG_ONLY_TEST_P(IndexLookupJoinTest, connectorError) { auto planNodeIdGenerator = std::make_shared(); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -1707,7 +1707,7 @@ DEBUG_ONLY_TEST_P(IndexLookupJoinTest, prefetch) { auto planNodeIdGenerator = std::make_shared(); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -1798,7 +1798,7 @@ TEST_P(IndexLookupJoinTest, outputBatchSize) { auto planNodeIdGenerator = std::make_shared(); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -1869,7 +1869,7 @@ DEBUG_ONLY_TEST_P(IndexLookupJoinTest, runtimeStats) { auto planNodeIdGenerator = std::make_shared(); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -1958,7 +1958,7 @@ TEST_P(IndexLookupJoinTest, barrier) { auto planNodeIdGenerator = std::make_shared(); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, @@ -2035,7 +2035,7 @@ TEST_P(IndexLookupJoinTest, joinFuzzer) { std::shuffle(scanOutput.begin(), scanOutput.end(), g); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> columnHandles; const auto indexScanNode = makeIndexScanNode( planNodeIdGenerator, diff --git a/velox/exec/tests/JoinSpillInputBenchmarkBase.cpp b/velox/exec/tests/JoinSpillInputBenchmarkBase.cpp index 15fa1befed5f..5d06573044d2 100644 --- a/velox/exec/tests/JoinSpillInputBenchmarkBase.cpp +++ b/velox/exec/tests/JoinSpillInputBenchmarkBase.cpp @@ -31,7 +31,7 @@ const int numSampleVectors = 100; void JoinSpillInputBenchmarkBase::setUp() { SpillerBenchmarkBase::setUp(); - common::SpillConfig spillConfig; + velox::common::SpillConfig spillConfig; spillConfig.getSpillDirPathCb = [&]() -> std::string_view { return spillDir_; }; diff --git a/velox/exec/tests/MultiFragmentTest.cpp b/velox/exec/tests/MultiFragmentTest.cpp index 
52c46c20180a..6094f467c1dc 100644 --- a/velox/exec/tests/MultiFragmentTest.cpp +++ b/velox/exec/tests/MultiFragmentTest.cpp @@ -39,11 +39,11 @@ namespace { struct TestParam { VectorSerde::Kind serdeKind; - common::CompressionKind compressionKind; + velox::common::CompressionKind compressionKind; TestParam( VectorSerde::Kind _serdeKind, - common::CompressionKind _compressionKind) + velox::common::CompressionKind _compressionKind) : serdeKind(_serdeKind), compressionKind(_compressionKind) {} }; @@ -53,17 +53,17 @@ class MultiFragmentTest : public HiveConnectorTestBase, static std::vector getTestParams() { std::vector params; params.emplace_back( - VectorSerde::Kind::kPresto, common::CompressionKind_NONE); + VectorSerde::Kind::kPresto, velox::common::CompressionKind_NONE); params.emplace_back( - VectorSerde::Kind::kCompactRow, common::CompressionKind_NONE); + VectorSerde::Kind::kCompactRow, velox::common::CompressionKind_NONE); params.emplace_back( - VectorSerde::Kind::kUnsafeRow, common::CompressionKind_NONE); + VectorSerde::Kind::kUnsafeRow, velox::common::CompressionKind_NONE); params.emplace_back( - VectorSerde::Kind::kPresto, common::CompressionKind_LZ4); + VectorSerde::Kind::kPresto, velox::common::CompressionKind_LZ4); params.emplace_back( - VectorSerde::Kind::kCompactRow, common::CompressionKind_LZ4); + VectorSerde::Kind::kCompactRow, velox::common::CompressionKind_LZ4); params.emplace_back( - VectorSerde::Kind::kUnsafeRow, common::CompressionKind_LZ4); + VectorSerde::Kind::kUnsafeRow, velox::common::CompressionKind_LZ4); return params; } @@ -73,7 +73,7 @@ class MultiFragmentTest : public HiveConnectorTestBase, exec::ExchangeSource::factories().clear(); exec::ExchangeSource::registerFactory(createLocalExchangeSource); configSettings_[core::QueryConfig::kShuffleCompressionKind] = - common::compressionKindToString(GetParam().compressionKind); + velox::common::compressionKindToString(GetParam().compressionKind); } void TearDown() override { @@ -391,7 +391,7 @@ TEST_P(MultiFragmentTest, aggregationMultiKey) { .splits(std::move(finalAggTaskSplits)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults("SELECT c0 % 10, c1 % 2, sum(c2) FROM tmp GROUP BY 1, 2"); for (auto& task : tasks) { @@ -424,7 +424,7 @@ TEST_P(MultiFragmentTest, distributedTableScan) { .split(remoteSplit(leafTaskId)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults("SELECT c2, c1 % 2, c0 % 10 FROM tmp"); verifyExchangeStats(task, 1, 1); @@ -584,7 +584,7 @@ TEST_P(MultiFragmentTest, mergeExchange) { .split(remoteSplit(finalSortTaskId)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults( "SELECT * FROM tmp ORDER BY 1 NULLS LAST", std::vector{0}); @@ -632,7 +632,7 @@ TEST_P(MultiFragmentTest, partitionedOutput) { .split(remoteSplit(leafTaskId)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults("SELECT c0, c1 FROM tmp"); ASSERT_TRUE(waitForTaskCompletion(leafTask.get())) << leafTask->taskId(); @@ -656,7 +656,7 @@ 
TEST_P(MultiFragmentTest, partitionedOutput) { .split(remoteSplit(leafTaskId)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults("SELECT c3, c0, c2 FROM tmp"); ASSERT_TRUE(waitForTaskCompletion(leafTask.get())) << leafTask->taskId(); @@ -684,7 +684,7 @@ TEST_P(MultiFragmentTest, partitionedOutput) { .split(remoteSplit(leafTaskId)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults("SELECT c0, c1, c2, c3, c4, c3, c2, c1, c0 FROM tmp"); ASSERT_TRUE(waitForTaskCompletion(leafTask.get())) << leafTask->taskId(); @@ -731,7 +731,7 @@ TEST_P(MultiFragmentTest, partitionedOutput) { .splits(std::move(intermediateSplits)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults("SELECT c3, c0, c2 FROM tmp"); verifyExchangeStats(task, kFanout, kFanout); @@ -770,7 +770,7 @@ TEST_P(MultiFragmentTest, partitionedOutput) { .split(remoteSplit(leafTaskId)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .copyResults(pool()); ASSERT_EQ(*result->type(), *ROW({})); ASSERT_EQ(result->size(), numRows); @@ -844,7 +844,7 @@ TEST_P(MultiFragmentTest, noHashPartitionSkew) { .destination(partition) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .maxDrivers(numConsumerDriverThreads) .assertResults(expectedResult); @@ -930,7 +930,7 @@ TEST_P(MultiFragmentTest, noHivePartitionSkew) { .destination(partition) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .maxDrivers(numConsumerDriverThreads) .assertResults(expectedResult); @@ -988,7 +988,7 @@ TEST_P(MultiFragmentTest, partitionedOutputWithLargeInput) { .split(remoteSplit(leafTaskId)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults("SELECT c0, c1, c2, c3, c4 FROM tmp"); ASSERT_TRUE(waitForTaskCompletion(leafTask.get())) << leafTask->taskId() << "state: " << leafTask->state(); @@ -1043,7 +1043,7 @@ TEST_P(MultiFragmentTest, partitionedOutputWithLargeInput) { .splits(std::move(intermediateSplits)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults("SELECT c0, c1, c2, c3, c4 FROM tmp"); ASSERT_TRUE(waitForTaskCompletion(leafTask.get())) << "state: " << leafTask->state(); @@ -1101,7 +1101,7 @@ TEST_P(MultiFragmentTest, broadcast) { .splits(std::move(finalAggTaskSplits)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults("SELECT UNNEST(array[1000, 1000, 1000])"); for (auto& task : tasks) 
{ @@ -1183,7 +1183,7 @@ TEST_P(MultiFragmentTest, roundRobinPartition) { .splits(std::move(collectTaskSplits)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults("SELECT * FROM tmp"); for (auto& task : tasks) { @@ -1251,7 +1251,7 @@ TEST_P(MultiFragmentTest, constantKeys) { .splits(std::move(finalAggTaskSplits)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults( "SELECT 3 * ceil(1000.0 / 7) /* number of null rows */, 1000 + 2 * ceil(1000.0 / 7) /* total number of rows */"); @@ -1318,7 +1318,7 @@ TEST_P(MultiFragmentTest, replicateNullsAndAny) { .splits(std::move(finalAggTaskSplits)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults( "SELECT 3 * ceil(1000.0 / 7) /* number of null rows */, 1000 + 2 * ceil(1000.0 / 7) /* total number of rows */"); @@ -1362,7 +1362,7 @@ TEST_P(MultiFragmentTest, limit) { .split(remoteSplit(leafTaskId)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults( "VALUES (null), (1), (2), (3), (4), (5), (6), (null), (8), (9)"); ASSERT_TRUE(waitForTaskCompletion(task.get())) << task->taskId(); @@ -1403,7 +1403,7 @@ TEST_P(MultiFragmentTest, mergeExchangeOverEmptySources) { .splits(std::move(leafTaskSplits)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults(""); for (auto& task : tasks) { @@ -1505,7 +1505,7 @@ TEST_P(MultiFragmentTest, earlyCompletion) { .splits(std::move(joinTaskSplits)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults("SELECT UNNEST([3, 3, 3, 3, 4, 4, 4, 4])"); for (auto& task : tasks) { @@ -1581,7 +1581,7 @@ TEST_P(MultiFragmentTest, earlyCompletionBroadcast) { .splits(std::move(joinTaskSplits)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults("SELECT UNNEST([10, 10, 10, 10])"); for (auto& task : tasks) { @@ -1664,7 +1664,7 @@ TEST_P(MultiFragmentTest, earlyCompletionMerge) { .splits(std::move(joinTaskSplits)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults("SELECT UNNEST([3, 3, 3, 3, 4, 4, 4, 4])"); for (auto& task : tasks) { @@ -1925,7 +1925,7 @@ TEST_P(MultiFragmentTest, customPlanNodeWithExchangeClient) { CursorParameters params; params.queryConfigs.emplace( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)); + velox::common::compressionKindToString(GetParam().compressionKind)); core::PlanNodeId testNodeId; params.maxDrivers = 1; params.planNode = @@ -2391,7 +2391,7 @@ class DataFetcher { /// of individual 
pages. PartitionedOutput operator is expected to limit page /// sizes to no more than 1MB give and take 30%. DEBUG_ONLY_TEST_P(MultiFragmentTest, maxBytes) { - if (GetParam().compressionKind != common::CompressionKind_NONE) { + if (GetParam().compressionKind != velox::common::CompressionKind_NONE) { // NOTE: different compression generates different serialized byte size so // only test with no-compression to ease testing.s return; @@ -2776,7 +2776,7 @@ TEST_P(MultiFragmentTest, compression) { .split(remoteSplit(producerTaskId)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .destination(0) .assertResults(expected); @@ -2784,21 +2784,21 @@ TEST_P(MultiFragmentTest, compression) { const auto& consumerPlanStats = consumerTaskStats.at("0"); ASSERT_EQ( consumerPlanStats.customStats.at(Operator::kShuffleCompressionKind).min, - static_cast(GetParam().compressionKind)); + static_cast(GetParam().compressionKind)); ASSERT_EQ( consumerPlanStats.customStats.at(Operator::kShuffleCompressionKind).max, - static_cast(GetParam().compressionKind)); + static_cast(GetParam().compressionKind)); ASSERT_EQ(data->size() * kNumRepeats, consumerPlanStats.outputRows); auto producerTaskStats = exec::toPlanStats(producerTask->taskStats()); const auto& producerStats = producerTaskStats.at("1"); ASSERT_EQ( producerStats.customStats.at(Operator::kShuffleCompressionKind).min, - static_cast(GetParam().compressionKind)); + static_cast(GetParam().compressionKind)); ASSERT_EQ( producerStats.customStats.at(Operator::kShuffleCompressionKind).max, - static_cast(GetParam().compressionKind)); - if (GetParam().compressionKind == common::CompressionKind_NONE) { + static_cast(GetParam().compressionKind)); + if (GetParam().compressionKind == velox::common::CompressionKind_NONE) { ASSERT_EQ( producerStats.customStats.count( IterativeVectorSerializer::kCompressedBytes), @@ -2930,7 +2930,7 @@ TEST_P(MultiFragmentTest, scaledTableScan) { .split(remoteSplit(finalAggTaskId)) .config( core::QueryConfig::kShuffleCompressionKind, - common::compressionKindToString(GetParam().compressionKind)) + velox::common::compressionKindToString(GetParam().compressionKind)) .assertResults( "SELECT c5, max(c0), sum(c1), sum(c2), sum(c3), sum(c4) FROM tmp group by c5"); diff --git a/velox/exec/tests/OperatorTraceTest.cpp b/velox/exec/tests/OperatorTraceTest.cpp index 64a9886f50d5..241ba021d037 100644 --- a/velox/exec/tests/OperatorTraceTest.cpp +++ b/velox/exec/tests/OperatorTraceTest.cpp @@ -49,9 +49,9 @@ class OperatorTraceTest : public HiveConnectorTestBase { serializer::presto::PrestoVectorSerde::registerVectorSerde(); } Type::registerSerDe(); - common::Filter::registerSerDe(); + velox::common::Filter::registerSerDe(); connector::hive::HiveTableHandle::registerSerDe(); - connector::hive::LocationHandle::registerSerDe(); + connector::hive::HiveLocationHandle::registerSerDe(); connector::hive::HiveColumnHandle::registerSerDe(); connector::hive::HiveInsertTableHandle::registerSerDe(); connector::hive::HiveConnectorSplit::registerSerDe(); diff --git a/velox/exec/tests/OrderByTest.cpp b/velox/exec/tests/OrderByTest.cpp index 1f44d3d270c6..223eae424664 100644 --- a/velox/exec/tests/OrderByTest.cpp +++ b/velox/exec/tests/OrderByTest.cpp @@ -43,7 +43,7 @@ namespace facebook::velox::exec::test { namespace { // Returns aggregated spilled stats by 'task'. 
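// Illustrative sketch (not a hunk from this patch): the MultiFragmentTest hunks
// above all route the parameterized compression kind through the shuffle
// compression query config and later check it against the operator's custom
// stats. A condensed illustration of that pattern as it would read inside the
// fixture; the AssertQueryBuilder overload taking the DuckDB runner and the
// int64_t cast in the assertion are assumptions, while the config key, helper
// names, and stat name are taken from the hunks above.
auto task =
    test::AssertQueryBuilder(plan, duckDbQueryRunner_)
        .split(remoteSplit(leafTaskId))
        .config(
            core::QueryConfig::kShuffleCompressionKind,
            velox::common::compressionKindToString(GetParam().compressionKind))
        .assertResults("SELECT c3, c0, c2 FROM tmp");

const auto planStats = exec::toPlanStats(task->taskStats());
const auto& customStats = planStats.at("0").customStats;
ASSERT_EQ(
    customStats.at(Operator::kShuffleCompressionKind).min,
    static_cast<int64_t>(GetParam().compressionKind));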
common::SpillStats spilledStats(const exec::Task& task) { - common::SpillStats spilledStats; + velox::common::SpillStats spilledStats; auto stats = task.taskStats(); for (auto& pipeline : stats.pipelineStats) { for (auto op : pipeline.operatorStats) { diff --git a/velox/exec/tests/PlanNodeSerdeTest.cpp b/velox/exec/tests/PlanNodeSerdeTest.cpp index 54f5d77361bf..b4e2a288c653 100644 --- a/velox/exec/tests/PlanNodeSerdeTest.cpp +++ b/velox/exec/tests/PlanNodeSerdeTest.cpp @@ -37,9 +37,9 @@ class PlanNodeSerdeTest : public testing::Test, parse::registerTypeResolver(); Type::registerSerDe(); - common::Filter::registerSerDe(); + velox::common::Filter::registerSerDe(); connector::hive::HiveTableHandle::registerSerDe(); - connector::hive::LocationHandle::registerSerDe(); + connector::hive::HiveLocationHandle::registerSerDe(); connector::hive::HiveColumnHandle::registerSerDe(); connector::hive::HiveInsertTableHandle::registerSerDe(); connector::hive::HiveInsertFileNameGenerator::registerSerDe(); diff --git a/velox/exec/tests/PrefixSortTest.cpp b/velox/exec/tests/PrefixSortTest.cpp index f404fa05ac4d..8eb2918eefaf 100644 --- a/velox/exec/tests/PrefixSortTest.cpp +++ b/velox/exec/tests/PrefixSortTest.cpp @@ -63,7 +63,7 @@ class PrefixSortTest : public exec::test::OperatorTestBase { const auto maxBytes = PrefixSort::maxRequiredBytes( &rowContainer, compareFlags, - common::PrefixSortConfig{ + velox::common::PrefixSortConfig{ 1024, // Set threshold to 0 to enable prefix-sort in small dataset. 0, @@ -75,7 +75,7 @@ class PrefixSortTest : public exec::test::OperatorTestBase { PrefixSort::sort( &rowContainer, compareFlags, - common::PrefixSortConfig{ + velox::common::PrefixSortConfig{ 1024, // Set threshold to 0 to enable prefix-sort in small dataset. 0, diff --git a/velox/exec/tests/SortBufferTest.cpp b/velox/exec/tests/SortBufferTest.cpp index 93bf32edf4df..f1aa55682c8e 100644 --- a/velox/exec/tests/SortBufferTest.cpp +++ b/velox/exec/tests/SortBufferTest.cpp @@ -66,14 +66,14 @@ class SortBufferTest : public OperatorTestBase, OperatorTestBase::TearDown(); } - common::SpillConfig getSpillConfig( + velox::common::SpillConfig getSpillConfig( const std::string& spillDir, bool enableSpillPrefixSort = true) const { std::optional spillPrefixSortConfig = enableSpillPrefixSort ? std::optional(prefixSortConfig_) : std::nullopt; - return common::SpillConfig( + return velox::common::SpillConfig( [spillDir]() -> const std::string& { return spillDir; }, [&](uint64_t) {}, "0.0.0", @@ -385,7 +385,7 @@ TEST_P(SortBufferTest, batchOutput) { for (const auto& testData : testSettings) { SCOPED_TRACE(testData.debugString()); auto spillDirectory = exec::test::TempDirectoryPath::create(); - auto spillConfig = common::SpillConfig( + auto spillConfig = velox::common::SpillConfig( [&]() -> const std::string& { return spillDirectory->getPath(); }, [&](uint64_t) {}, "0.0.0", @@ -402,7 +402,7 @@ TEST_P(SortBufferTest, batchOutput) { 0, "none", prefixSortConfig_); - folly::Synchronized spillStats; + folly::Synchronized spillStats; auto sortBuffer = std::make_unique( inputType_, sortColumnIndices_, @@ -482,7 +482,7 @@ TEST_P(SortBufferTest, spill) { // memory reservation failure and thus trigger disk spilling. auto spillableReservationGrowthPct = testData.memoryReservationFailure ? 
100000 : 100; - auto spillConfig = common::SpillConfig( + auto spillConfig = velox::common::SpillConfig( [&]() -> const std::string& { return spillDirectory->getPath(); }, [&](uint64_t) {}, "0.0.0", @@ -499,7 +499,7 @@ TEST_P(SortBufferTest, spill) { 0, "none", prefixSortConfig_); - folly::Synchronized spillStats; + folly::Synchronized spillStats; auto sortBuffer = std::make_unique( inputType_, sortColumnIndices_, @@ -570,7 +570,7 @@ TEST_P(SortBufferTest, spill) { DEBUG_ONLY_TEST_P(SortBufferTest, spillDuringInput) { auto spillDirectory = exec::test::TempDirectoryPath::create(); const auto spillConfig = getSpillConfig(spillDirectory->getPath()); - folly::Synchronized spillStats; + folly::Synchronized spillStats; auto sortBuffer = std::make_unique( inputType_, sortColumnIndices_, @@ -626,7 +626,7 @@ DEBUG_ONLY_TEST_P(SortBufferTest, spillDuringInput) { DEBUG_ONLY_TEST_P(SortBufferTest, spillDuringOutput) { auto spillDirectory = exec::test::TempDirectoryPath::create(); const auto spillConfig = getSpillConfig(spillDirectory->getPath()); - folly::Synchronized spillStats; + folly::Synchronized spillStats; auto sortBuffer = std::make_unique( inputType_, sortColumnIndices_, @@ -680,7 +680,7 @@ DEBUG_ONLY_TEST_P(SortBufferTest, reserveMemorySortGetOutput) { auto spillDirectory = exec::test::TempDirectoryPath::create(); const auto spillConfig = getSpillConfig(spillDirectory->getPath()); - folly::Synchronized spillStats; + folly::Synchronized spillStats; auto sortBuffer = std::make_unique( inputType_, sortColumnIndices_, @@ -739,7 +739,7 @@ DEBUG_ONLY_TEST_P(SortBufferTest, reserveMemorySort) { "usePrefixSort: {}, spillEnabled: {}, ", usePrefixSort, spillEnabled)); auto spillDirectory = exec::test::TempDirectoryPath::create(); auto spillConfig = getSpillConfig(spillDirectory->getPath(), usePrefixSort); - folly::Synchronized spillStats; + folly::Synchronized spillStats; auto sortBuffer = std::make_unique( inputType_, sortColumnIndices_, @@ -784,7 +784,7 @@ TEST_P(SortBufferTest, emptySpill) { SCOPED_TRACE(fmt::format("hasPostSpillData {}", hasPostSpillData)); auto spillDirectory = exec::test::TempDirectoryPath::create(); auto spillConfig = getSpillConfig(spillDirectory->getPath()); - folly::Synchronized spillStats; + folly::Synchronized spillStats; auto sortBuffer = std::make_unique( inputType_, sortColumnIndices_, diff --git a/velox/exec/tests/SpillTest.cpp b/velox/exec/tests/SpillTest.cpp index 9097abd7f664..629ed0d7c9f1 100644 --- a/velox/exec/tests/SpillTest.cpp +++ b/velox/exec/tests/SpillTest.cpp @@ -55,15 +55,15 @@ class TestRuntimeStatWriter : public BaseRuntimeStatWriter { } // namespace struct TestParam { - const common::CompressionKind compressionKind; + const velox::common::CompressionKind compressionKind; const bool enablePrefixSort; - TestParam(common::CompressionKind _compressionKind, bool _enablePrefixSort) + TestParam(velox::common::CompressionKind _compressionKind, bool _enablePrefixSort) : compressionKind(_compressionKind), enablePrefixSort(_enablePrefixSort) {} TestParam(uint32_t value) - : compressionKind(static_cast(value >> 1)), + : compressionKind(static_cast(value >> 1)), enablePrefixSort(!!(value & 1)) {} uint32_t value() const { @@ -285,7 +285,7 @@ class SpillTest : public ::testing::TestWithParam, spillStats_.wlock()->reset(); const std::optional prefixSortConfig = enablePrefixSort_ - ? std::optional(common::PrefixSortConfig()) + ? 
std::optional(velox::common::PrefixSortConfig()) : std::nullopt; const int32_t numSortKeys = 1; const auto sortingKeys = SpillState::makeSortingKeys( @@ -413,7 +413,7 @@ class SpillTest : public ::testing::TestWithParam, compareFlags.empty() ? true : compareFlags[0].nullsFirst, compareFlags.empty() ? true : compareFlags[0].ascending)); - const auto prevGStats = common::globalSpillStats(); + const auto prevGStats = velox::common::globalSpillStats(); SpillPartitionIdSet partitionIds = genPartitionIdSet(numPartitions); @@ -438,7 +438,7 @@ class SpillTest : public ::testing::TestWithParam, // NOTE: the following stats are not collected by spill state. ASSERT_EQ(stats.spillFillTimeNanos, 0); ASSERT_EQ(stats.spillSortTimeNanos, 0); - const auto newGStats = common::globalSpillStats(); + const auto newGStats = velox::common::globalSpillStats(); ASSERT_EQ( prevGStats.spilledPartitions + stats.spilledPartitions, newGStats.spilledPartitions); @@ -573,17 +573,17 @@ class SpillTest : public ::testing::TestWithParam, folly::Random::DefaultGenerator rng_; std::shared_ptr tempDir_; memory::MemoryAllocator* allocator_; - common::CompressionKind compressionKind_; + velox::common::CompressionKind compressionKind_; bool enablePrefixSort_; std::vector> values_; folly::F14FastMap> batchesByPartition_; std::string fileNamePrefix_; - folly::Synchronized spillStats_; + folly::Synchronized spillStats_; std::unique_ptr state_; std::unordered_map runtimeStats_; std::unique_ptr statWriter_; - common::UpdateAndCheckSpillLimitCB updateSpilledBytesCb_; + velox::common::UpdateAndCheckSpillLimitCB updateSpilledBytesCb_; }; TEST_P(SpillTest, spillState) { @@ -621,7 +621,7 @@ TEST_P(SpillTest, spillTimestamp) { Timestamp{Timestamp::kMinSeconds, 0}}; const std::optional prefixSortConfig = enablePrefixSort_ - ? std::optional(common::PrefixSortConfig()) + ? std::optional(velox::common::PrefixSortConfig()) : std::nullopt; SpillState state( [&]() -> const std::string& { return tempDirectory->getPath(); }, @@ -1441,7 +1441,7 @@ SpillFiles makeFakeSpillFiles(int32_t numFiles) { tempDir->getPath() + "/Spill_" + std::to_string(fileId), 1024, SpillState::makeSortingKeys(std::vector(1)), - common::CompressionKind_NONE}); + velox::common::CompressionKind_NONE}); } return files; } diff --git a/velox/exec/tests/SpillerBenchmarkBase.h b/velox/exec/tests/SpillerBenchmarkBase.h index a4a4fcc228ea..5700ae42d191 100644 --- a/velox/exec/tests/SpillerBenchmarkBase.h +++ b/velox/exec/tests/SpillerBenchmarkBase.h @@ -71,6 +71,6 @@ class SpillerBenchmarkBase { std::unique_ptr spiller_; // Stats. uint64_t executionTimeUs_{0}; - folly::Synchronized spillStats_; + folly::Synchronized spillStats_; }; } // namespace facebook::velox::exec::test diff --git a/velox/exec/tests/SpillerTest.cpp b/velox/exec/tests/SpillerTest.cpp index e9f435495960..b9afe95371bd 100644 --- a/velox/exec/tests/SpillerTest.cpp +++ b/velox/exec/tests/SpillerTest.cpp @@ -92,14 +92,14 @@ struct TestParam { // Specifies the spill executor pool size. If the size is zero, then spill // write path is executed inline with spiller control code path. 
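// Illustrative sketch (not a hunk from this patch): SpillTest's TestParam above
// packs the compression kind and the prefix-sort flag into a single uint32_t so
// the whole parameter matrix can be generated from one integer range. The struct
// name here is only for illustration; the cast target in the constructor and the
// body of value() are assumptions consistent with the constructor shown above.
struct PackedSpillTestParam {
  velox::common::CompressionKind compressionKind;
  bool enablePrefixSort;

  explicit PackedSpillTestParam(uint32_t value)
      : compressionKind(
            static_cast<velox::common::CompressionKind>(value >> 1)),
        enablePrefixSort((value & 1) != 0) {}

  uint32_t value() const {
    // Inverse of the constructor: compression kind in the high bits, the
    // prefix-sort flag in bit 0.
    return (static_cast<uint32_t>(compressionKind) << 1) |
        (enablePrefixSort ? 1 : 0);
  }
};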
int poolSize; - common::CompressionKind compressionKind; + velox::common::CompressionKind compressionKind; bool enablePrefixSort; core::JoinType joinType; TestParam( SpillerType _type, int _poolSize, - common::CompressionKind _compressionKind, + velox::common::CompressionKind _compressionKind, bool _enablePrefixSort, core::JoinType _joinType) : type(_type), @@ -126,8 +126,8 @@ struct TestParamsBuilder { for (int i = 0; i < numSpillerTypes; ++i) { const auto type = static_cast(i); if (typesToExclude.find(type) == typesToExclude.end()) { - common::CompressionKind compressionKind = - static_cast(numSpillerTypes % 6); + velox::common::CompressionKind compressionKind = + static_cast(numSpillerTypes % 6); for (int poolSize : {0, 8}) { params.emplace_back( type, @@ -339,7 +339,7 @@ class SpillerTest : public exec::test::RowContainerTestBase { ascending, makeError)); constexpr int32_t kNumRows = 5'000; - const auto prevGStats = common::globalSpillStats(); + const auto prevGStats = velox::common::globalSpillStats(); setupSpillData(numKeys_, kNumRows, numDuplicates, [&](RowVectorPtr rows) { // Set ordinal so that the sorted order is unambiguous. @@ -407,7 +407,7 @@ class SpillerTest : public exec::test::RowContainerTestBase { ASSERT_GT(stats.spillSerializationTimeNanos, 0); ASSERT_GT(stats.spillWrites, 0); - const auto newGStats = common::globalSpillStats(); + const auto newGStats = velox::common::globalSpillStats(); ASSERT_EQ( prevGStats.spilledFiles + stats.spilledFiles, newGStats.spilledFiles); ASSERT_EQ( @@ -607,14 +607,14 @@ class SpillerTest : public exec::test::RowContainerTestBase { uint64_t maxSpillRunRows = 0, uint64_t readBufferSize = 1 << 20) { static const std::string kBadSpillDirPath = "/bad/path"; - common::GetSpillDirectoryPathCB badSpillDirCb = [&]() -> std::string_view { + velox::common::GetSpillDirectoryPathCB badSpillDirCb = [&]() -> std::string_view { return kBadSpillDirPath; }; - common::GetSpillDirectoryPathCB tempSpillDirCb = [&]() -> std::string_view { + velox::common::GetSpillDirectoryPathCB tempSpillDirCb = [&]() -> std::string_view { return tempDirPath_->getPath(); }; stats_.clear(); - spillStats_ = folly::Synchronized(); + spillStats_ = folly::Synchronized(); spillConfig_.startPartitionBit = hashBits_.begin(); spillConfig_.numPartitionBits = hashBits_.numBits(); @@ -627,7 +627,7 @@ class SpillerTest : public exec::test::RowContainerTestBase { spillConfig_.compressionKind = compressionKind_; enablePrefixSort_ ? spillConfig_.prefixSortConfig = std::optional( - common::PrefixSortConfig()) + velox::common::PrefixSortConfig()) : spillConfig_.prefixSortConfig = std::nullopt; spillConfig_.maxSpillRunRows = maxSpillRunRows; spillConfig_.maxFileSize = targetFileSize; @@ -882,7 +882,7 @@ class SpillerTest : public exec::test::RowContainerTestBase { // them by partition. std::vector> spillers; for (int iter = 0; iter < numSpillers; ++iter) { - const auto prevGStats = common::globalSpillStats(); + const auto prevGStats = velox::common::globalSpillStats(); setupSpillData( numKeys_, (type_ != SpillerType::NO_ROW_CONTAINER) ? 
numBatchRows * 10 : 0, @@ -963,7 +963,7 @@ class SpillerTest : public exec::test::RowContainerTestBase { ASSERT_EQ(stats.spillFillTimeNanos, 0); } - const auto newGStats = common::globalSpillStats(); + const auto newGStats = velox::common::globalSpillStats(); ASSERT_EQ( prevGStats.spilledFiles + stats.spilledFiles, newGStats.spilledFiles); ASSERT_EQ( @@ -1007,7 +1007,7 @@ class SpillerTest : public exec::test::RowContainerTestBase { // Spilled file stats should be updated after finalizing spiller. if (numAppendBatches > 0) { - ASSERT_GT(common::globalSpillStats().spilledFiles, 0); + ASSERT_GT(velox::common::globalSpillStats().spilledFiles, 0); } } @@ -1203,7 +1203,7 @@ class SpillerTest : public exec::test::RowContainerTestBase { const TestParam param_; const SpillerType type_; const int32_t executorPoolSize_; - const common::CompressionKind compressionKind_; + const velox::common::CompressionKind compressionKind_; const bool enablePrefixSort_; const core::JoinType joinType_; const bool spillProbedFlag_; @@ -1230,8 +1230,8 @@ class SpillerTest : public exec::test::RowContainerTestBase { std::vector> partitions_; std::vector compareFlags_; std::unique_ptr spiller_; - common::SpillConfig spillConfig_; - folly::Synchronized spillStats_; + velox::common::SpillConfig spillConfig_; + folly::Synchronized spillStats_; }; struct AllTypesTestParam { diff --git a/velox/exec/tests/TableEvolutionFuzzerTest.cpp b/velox/exec/tests/TableEvolutionFuzzerTest.cpp index 09f672488cc5..155c78f47bbe 100644 --- a/velox/exec/tests/TableEvolutionFuzzerTest.cpp +++ b/velox/exec/tests/TableEvolutionFuzzerTest.cpp @@ -34,17 +34,17 @@ namespace { void registerFactories(folly::Executor* ioExecutor) { filesystems::registerLocalFileSystem(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); auto hiveConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( TableEvolutionFuzzer::connectorId(), std::make_shared( std::unordered_map()), ioExecutor); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); dwio::common::registerFileSinks(); dwrf::registerDwrfReaderFactory(); dwrf::registerDwrfWriterFactory(); diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index b4d2b1060b80..a922ba35ba74 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -100,7 +100,7 @@ class TableScanTest : public HiveConnectorTestBase { std::shared_ptr assertQuery( const PlanNodePtr& plan, - const std::shared_ptr& hiveSplit, + const std::shared_ptr& hiveSplit, const std::string& duckDbSql) { return OperatorTestBase::assertQuery(plan, {hiveSplit}, duckDbSql); } @@ -187,7 +187,7 @@ class TableScanTest : public HiveConnectorTestBase { ASSERT_EQ(n, task->numFinishedDrivers()); } - void testPartitionedTableImpl( + void testPartitionedTableImpl(TypeWithId const std::string& filePath, const TypePtr& partitionType, const std::optional& partitionValue) { @@ -196,7 +196,7 @@ class TableScanTest : public HiveConnectorTestBase { .build(); auto outputType = ROW({"pkey", "c0", "c1"}, {partitionType, BIGINT(), DOUBLE()}); - ConnectorColumnHandleMap assignments = { + connector::common::ConnectorColumnHandleMap assignments = { {"pkey", partitionKey("pkey", partitionType)}, {"c0", regularColumn("c0", BIGINT())}, {"c1", regularColumn("c1", DOUBLE())}}; @@ -402,7 +402,7 @@ 
DEBUG_ONLY_TEST_F(TableScanTest, pendingCoalescedIoWhenTaskFailed) { TEST_F(TableScanTest, connectorStats) { auto hiveConnector = std::dynamic_pointer_cast( - connector::getConnector(kHiveConnectorId)); + connector::common::getConnector(kHiveConnectorId)); EXPECT_NE(nullptr, hiveConnector); verifyCacheStats(hiveConnector->fileHandleCacheStats(), 0, 0, 0); @@ -479,7 +479,7 @@ TEST_F(TableScanTest, partitionKeyAlias) { writeToFile(filePath->getPath(), vectors); createDuckDbTable(vectors); - ConnectorColumnHandleMap assignments = { + connector::common::ConnectorColumnHandleMap assignments = { {"a", regularColumn("c0", BIGINT())}, {"ds_alias", partitionKey("ds", VARCHAR())}}; @@ -711,11 +711,11 @@ TEST_F(TableScanTest, subfieldPruningRowType) { auto vectors = makeVectors(10, 1'000, rowType); auto filePath = TempFilePath::create(); writeToFile(filePath->getPath(), vectors); - std::vector requiredSubfields; + std::vector requiredSubfields; requiredSubfields.emplace_back("e.c"); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; assignments["e"] = std::make_shared( "e", @@ -770,11 +770,11 @@ TEST_F(TableScanTest, subfieldPruningRemainingFilterSubfieldsMissing) { auto vectors = makeVectors(10, 1'000, rowType); auto filePath = TempFilePath::create(); writeToFile(filePath->getPath(), vectors); - std::vector requiredSubfields; + std::vector requiredSubfields; requiredSubfields.emplace_back("e.c"); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; assignments["e"] = std::make_shared( "e", @@ -832,7 +832,7 @@ TEST_F(TableScanTest, subfieldPruningRemainingFilterRootFieldMissing) { writeToFile(filePath->getPath(), vectors); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; assignments["d"] = std::make_shared( "d", HiveColumnHandle::ColumnType::kRegular, BIGINT(), BIGINT()); @@ -877,12 +877,12 @@ TEST_F(TableScanTest, subfieldPruningRemainingFilterStruct) { SCOPED_TRACE(fmt::format("{} {}", outputColumn, filterColumn)); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; assignments["d"] = std::make_shared( "d", HiveColumnHandle::ColumnType::kRegular, BIGINT(), BIGINT()); if (outputColumn > kNoOutput) { - std::vector subfields; + std::vector subfields; if (outputColumn == kSubfieldOnly) { subfields.emplace_back("c.b"); } @@ -965,12 +965,12 @@ TEST_F(TableScanTest, subfieldPruningRemainingFilterMap) { SCOPED_TRACE(fmt::format("{} {}", outputColumn, filterColumn)); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; assignments["a"] = std::make_shared( "a", HiveColumnHandle::ColumnType::kRegular, BIGINT(), BIGINT()); if (outputColumn > kNoOutput) { - std::vector subfields; + std::vector subfields; if (outputColumn == kSubfieldOnly) { subfields.emplace_back("b[1]"); } @@ -1061,13 +1061,13 @@ TEST_F(TableScanTest, subfieldPruningMapType) { auto rowType = asRowType(vectors[0]->type()); auto filePath = TempFilePath::create(); writeToFile(filePath->getPath(), vectors); - std::vector requiredSubfields; + std::vector requiredSubfields; requiredSubfields.emplace_back("c[0]"); requiredSubfields.emplace_back("c[2]"); requiredSubfields.emplace_back("c[4]"); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; assignments["c"] = std::make_shared( "c", @@ -1149,11 +1149,11 @@ TEST_F(TableScanTest, subfieldPruningArrayType) { auto rowType = asRowType(vectors[0]->type()); auto filePath = TempFilePath::create(); 
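// Illustrative sketch (not a hunk from this patch): the subfield-pruning tests
// above, and the array-type test continuing below, prune nested data by
// attaching required subfields to the Hive column handle. The template
// arguments, the HiveColumnHandle constructor shape (name, column type, data
// type, hive type, required subfields), and the BIGINT() element type are
// assumptions consistent with this patch's renames; the subfield path and field
// name are as in the surrounding test.
std::vector<velox::common::Subfield> requiredSubfields;
requiredSubfields.emplace_back("c[3]");
connector::common::ConnectorColumnHandleMap assignments;
assignments["c"] = std::make_shared<connector::hive::HiveColumnHandle>(
    "c",
    connector::hive::HiveColumnHandle::ColumnType::kRegular,
    ARRAY(BIGINT()),
    ARRAY(BIGINT()),
    std::move(requiredSubfields));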
writeToFile(filePath->getPath(), vectors); - std::vector requiredSubfields; + std::vector requiredSubfields; requiredSubfields.emplace_back("c[3]"); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; assignments["c"] = std::make_shared( "c", @@ -1312,11 +1312,11 @@ TEST_F(TableScanTest, missingColumns) { assertQuery(op, filePaths, "SELECT count(*) FROM tmp WHERE c1 <= 4000.1", 0); // Use missing column 'c1' in 'is null' filter, while not selecting 'c1'. - common::SubfieldFilters filters; - filters[common::Subfield("c1")] = lessThanOrEqualDouble(1050.0, true); + velox::common::SubfieldFilters filters; + filters[velox::common::Subfield("c1")] = lessThanOrEqualDouble(1050.0, true); auto tableHandle = std::make_shared( kHiveConnectorId, "tmp", true, std::move(filters), nullptr, dataColumns); - ConnectorColumnHandleMap assignments; + connector::common::ConnectorColumnHandleMap assignments; assignments["c0"] = regularColumn("c0", BIGINT()); op = PlanBuilder(pool_.get()) .startTableScan() @@ -2157,14 +2157,14 @@ TEST_F(TableScanTest, partitionedTableDateKey) { .partitionKey("pkey", partitionValue) .build(); auto outputType = ROW({"pkey", "c0", "c1"}, {DATE(), BIGINT(), DOUBLE()}); - ConnectorColumnHandleMap assignments = { + connector::common::ConnectorColumnHandleMap assignments = { {"pkey", partitionKey("pkey", DATE())}, {"c0", regularColumn("c0", BIGINT())}, {"c1", regularColumn("c1", DOUBLE())}}; - common::SubfieldFilters filters; + velox::common::SubfieldFilters filters; // pkey > 2020-09-01. - filters[common::Subfield("pkey")] = std::make_unique( + filters[velox::common::Subfield("pkey")] = std::make_unique( 18506, std::numeric_limits::max(), false); auto tableHandle = std::make_shared( @@ -2197,7 +2197,7 @@ TEST_F(TableScanTest, partitionedTableTimestampKey) { .partitionKey("pkey", partitionValue) .build(); - ConnectorColumnHandleMap assignments = { + connector::common::ConnectorColumnHandleMap assignments = { {"pkey", partitionKey("pkey", TIMESTAMP())}, {"c0", regularColumn("c0", BIGINT())}, {"c1", regularColumn("c1", DOUBLE())}}; @@ -2328,7 +2328,7 @@ TEST_F(TableScanTest, partitionedTableTimestampKey) { auto planWithSubfilter = [&](bool asLocalTime) { auto outputType = ROW({"pkey", "c0", "c1"}, {TIMESTAMP(), BIGINT(), DOUBLE()}); - common::SubfieldFilters filters; + velox::common::SubfieldFilters filters; // pkey = 2023-10-27 00:12:35. 
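// Illustrative sketch (not a hunk from this patch): the partitionedTableDateKey
// hunk above pushes "pkey > 2020-09-01" down as a range filter over the date's
// days-since-epoch value (18506). The BigintRange filter type and the int64_t
// limits specialization are assumptions; the argument values are as shown above.
velox::common::SubfieldFilters filters;
filters[velox::common::Subfield("pkey")] =
    std::make_unique<velox::common::BigintRange>(
        18506, std::numeric_limits<int64_t>::max(), /*nullAllowed=*/false);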
auto lower = util::fromTimestampString( @@ -2337,7 +2337,7 @@ TEST_F(TableScanTest, partitionedTableTimestampKey) { if (asLocalTime) { lower.toGMT(Timestamp::defaultTimezone()); } - filters[common::Subfield("pkey")] = + filters[velox::common::Subfield("pkey")] = std::make_unique(lower, lower, false); auto tableHandle = std::make_shared( "test-hive", @@ -2525,7 +2525,7 @@ TEST_F(TableScanTest, statsBasedSkipping) { // c0 <= -1 -> whole file should be skipped based on stats auto subfieldFilters = singleSubfieldFilter("c0", lessThanOrEqual(-1)); - ConnectorColumnHandleMap assignments = { + connector::common::ConnectorColumnHandleMap assignments = { {"c1", regularColumn("c1", INTEGER())}}; auto assertQuery = [&](const std::string& query) { @@ -3036,7 +3036,7 @@ TEST_F(TableScanTest, filterPushdown) { createDuckDbTable(vectors); // c1 >= 0 or null and c3 is true - common::SubfieldFilters subfieldFilters = + velox::common::SubfieldFilters subfieldFilters = SubfieldFiltersBuilder() .add("c1", greaterThanOrEqual(0, true)) .add("c3", std::make_unique(true, false)) @@ -3132,7 +3132,7 @@ TEST_F(TableScanTest, path) { // use $path in a filter, but don't project it out auto tableHandle = makeTableHandle( - common::SubfieldFilters{}, + velox::common::SubfieldFilters{}, parseExpr(fmt::format("\"{}\" = '{}'", kPath, pathValue), typeWithPath)); op = PlanBuilder() .startTableScan() @@ -3189,7 +3189,7 @@ TEST_F(TableScanTest, fileSizeAndModifiedTime) { auto filterTest = [&](const std::string& filter) { auto tableHandle = makeTableHandle( - common::SubfieldFilters{}, + velox::common::SubfieldFilters{}, parseExpr(filter, allColumns), "hive_table", allColumns); @@ -3233,7 +3233,7 @@ TEST_F(TableScanTest, bucket) { auto filePaths = makeFilePaths(numBatches); - std::vector> splits; + std::vector> splits; splits.reserve(numBatches); std::vector buckets = {10, 12, 15, 16, 27}; @@ -3322,7 +3322,7 @@ TEST_F(TableScanTest, bucketConversion) { constexpr int kNewNumBuckets = 16; const int selectedBuckets[] = {3, 5, 11}; auto makeSplits = [&] { - std::vector> splits; + std::vector> splits; for (int bucket : selectedBuckets) { std::vector> handles; handles.push_back(makeColumnHandle("c0", INTEGER(), {})); @@ -3404,7 +3404,7 @@ TEST_F(TableScanTest, bucketConversionWithSubfieldPruning) { writeToFile(file->getPath(), {vector}); constexpr int kNewNumBuckets = 16; const int selectedBuckets[] = {3, 5, 11}; - std::vector> splits; + std::vector> splits; for (int bucket : selectedBuckets) { std::vector> handles; handles.push_back(makeColumnHandle("c0", key->type(), {})); @@ -3720,7 +3720,7 @@ TEST_F(TableScanTest, remainingFilter) { "SELECT * FROM tmp WHERE c1 > c0 AND c0 >= 0"); // Remaining filter uses columns that are not used otherwise. 
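// Illustrative sketch (not a hunk from this patch): the filterPushdown hunk
// above builds "c1 >= 0 or null and c3 is true" with the builder. Written out
// here with its template argument, which is an assumption
// (velox::common::BoolValue); the builder calls and argument values are as
// shown above.
velox::common::SubfieldFilters subfieldFilters =
    SubfieldFiltersBuilder()
        .add("c1", greaterThanOrEqual(0, /*nullAllowed=*/true))
        .add(
            "c3",
            std::make_unique<velox::common::BoolValue>(
                true, /*nullAllowed=*/false))
        .build();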
- ConnectorColumnHandleMap assignments = { + connector::common::ConnectorColumnHandleMap assignments = { {"c2", regularColumn("c2", DOUBLE())}}; assertQuery( @@ -4330,7 +4330,7 @@ TEST_F(TableScanTest, interleaveLazyEager) { auto eagerFile = TempFilePath::create(); writeToFile(eagerFile->getPath(), rowsWithNulls); - ConnectorColumnHandleMap assignments = { + connector::common::ConnectorColumnHandleMap assignments = { {"c0", regularColumn("c0", column->type())}}; CursorParameters params; params.planNode = PlanBuilder() @@ -5144,7 +5144,7 @@ TEST_F(TableScanTest, varbinaryPartitionKey) { writeToFile(filePath->getPath(), vectors); createDuckDbTable(vectors); - ConnectorColumnHandleMap assignments = { + connector::common::ConnectorColumnHandleMap assignments = { {"a", regularColumn("c0", BIGINT())}, {"ds_alias", partitionKey("ds", VARBINARY())}}; @@ -5191,7 +5191,7 @@ TEST_F(TableScanTest, timestampPartitionKey) { writeToFile(filePath->getPath(), vectors); const auto getSplits = [&]() { - std::vector> splits; + std::vector> splits; for (auto& t : inputs) { splits.push_back( exec::test::HiveConnectorSplitBuilder(filePath->getPath()) @@ -5201,7 +5201,7 @@ TEST_F(TableScanTest, timestampPartitionKey) { return splits; }; - ConnectorColumnHandleMap assignments = { + connector::common::ConnectorColumnHandleMap assignments = { {"t", partitionKey("t", TIMESTAMP())}}; auto plan = PlanBuilder() .startTableScan() @@ -5357,7 +5357,7 @@ TEST_F(TableScanTest, dynamicFilterWithRowIndexColumn) { makeFlatVector(5, folly::identity)}); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; assignments["a"] = std::make_shared( "a", @@ -5642,7 +5642,7 @@ TEST_F(TableScanTest, rowNumberInRemainingFilter) { auto outputType = ROW({"c0"}, {BIGINT()}); auto remainingFilter = parseExpr("r1 % 2 == 0", ROW({"r1"}, {BIGINT()})); auto tableHandle = - makeTableHandle(common::SubfieldFilters{}, remainingFilter); + makeTableHandle(velox::common::SubfieldFilters{}, remainingFilter); auto plan = PlanBuilder() .startTableScan() .outputType(outputType) diff --git a/velox/exec/tests/TableWriterTest.cpp b/velox/exec/tests/TableWriterTest.cpp index 461d3a124dcb..c5bb9df87683 100644 --- a/velox/exec/tests/TableWriterTest.cpp +++ b/velox/exec/tests/TableWriterTest.cpp @@ -14,3576 +14,3672 @@ * limitations under the License. 
*/ -#include "velox/exec/tests/utils/TableWriterTestBase.h" - -#include "folly/dynamic.h" +#include "velox/exec/TableWriter.h" +#include #include "velox/common/base/Fs.h" #include "velox/common/base/tests/GTestUtils.h" -#include "velox/common/hyperloglog/SparseHll.h" -#include "velox/common/testutil/TestValue.h" -#include "velox/connectors/hive/HiveConfig.h" -#include "velox/connectors/hive/HivePartitionFunction.h" -#include "velox/dwio/common/WriterFactory.h" +#include "velox/connectors/common/ConnectorNames.h" +#include "velox/connectors/common/PluginLoader.h" +#include "velox/exec/PlanBuilder.h" #include "velox/exec/PlanNodeStats.h" -#include "velox/exec/TableWriter.h" #include "velox/exec/tests/utils/AssertQueryBuilder.h" #include "velox/exec/tests/utils/HiveConnectorTestBase.h" -#include "velox/exec/tests/utils/PlanBuilder.h" -#include "velox/exec/tests/utils/TempDirectoryPath.h" -#include "velox/vector/fuzzer/VectorFuzzer.h" - -#include -#include -#include "folly/experimental/EventCount.h" -#include "velox/common/memory/MemoryArbitrator.h" -#include "velox/dwio/common/Options.h" -#include "velox/dwio/dwrf/writer/Writer.h" -#include "velox/exec/tests/utils/ArbitratorTestUtil.h" - -namespace velox::exec::test { -constexpr uint64_t kQueryMemoryCapacity = 512 * MB; - -class BasicTableWriterTestBase : public HiveConnectorTestBase {}; - -TEST_F(BasicTableWriterTestBase, roundTrip) { - vector_size_t size = 1'000; - auto data = makeRowVector({ - makeFlatVector(size, [](auto row) { return row; }), - makeFlatVector( - size, [](auto row) { return row * 2; }, nullEvery(7)), - }); - - auto sourceFilePath = TempFilePath::create(); - writeToFile(sourceFilePath->getPath(), data); - - auto targetDirectoryPath = TempDirectoryPath::create(); - - auto rowType = asRowType(data->type()); - auto plan = PlanBuilder() - .tableScan(rowType) - .tableWrite(targetDirectoryPath->getPath()) - .planNode(); - - auto results = AssertQueryBuilder(plan) - .split(makeHiveConnectorSplit(sourceFilePath->getPath())) - .copyResults(pool()); - ASSERT_EQ(2, results->size()); - - // First column has number of rows written in the first row and nulls in other - // rows. - auto rowCount = results->childAt(TableWriteTraits::kRowCountChannel) - ->as>(); - ASSERT_FALSE(rowCount->isNullAt(0)); - ASSERT_EQ(size, rowCount->valueAt(0)); - ASSERT_TRUE(rowCount->isNullAt(1)); - - // Second column contains details about written files. - auto details = results->childAt(TableWriteTraits::kFragmentChannel) - ->as>(); - ASSERT_TRUE(details->isNullAt(0)); - ASSERT_FALSE(details->isNullAt(1)); - folly::dynamic obj = folly::parseJson(details->valueAt(1)); - - ASSERT_EQ(size, obj["rowCount"].asInt()); - auto fileWriteInfos = obj["fileWriteInfos"]; - ASSERT_EQ(1, fileWriteInfos.size()); - - auto writeFileName = fileWriteInfos[0]["writeFileName"].asString(); - - // Read from 'writeFileName' and verify the data matches the original. - plan = PlanBuilder().tableScan(rowType).planNode(); - - auto copy = AssertQueryBuilder(plan) - .split(makeHiveConnectorSplit(fmt::format( - "{}/{}", targetDirectoryPath->getPath(), writeFileName))) - .copyResults(pool()); - assertEqualResults({data}, {copy}); -} - -// Generates a struct (row), write it as a flap map, and check that it is read -// back as a map. -TEST_F(BasicTableWriterTestBase, structAsMap) { - // Input struct type. - vector_size_t size = 1'000; - auto data = makeRowVector( - {"col1"}, - { - makeRowVector( - // Struct field names are the feature/map keys. 
- {"1", "2"}, - { - makeFlatVector(size, [](auto row) { return row; }), - makeFlatVector(size, [](auto row) { return row; }), - }), - }); - - // Write it as a flat map. - auto outputType = ROW({"col1"}, {MAP(INTEGER(), INTEGER())}); - auto targetDirectoryPath = TempDirectoryPath::create(); - std::string fileName = "output_file"; - - auto plan = PlanBuilder() - .values({data}) - .tableWrite( - targetDirectoryPath->getPath(), - {}, - 0, - {}, - {}, - dwio::common::FileFormat::DWRF, - {}, - PlanBuilder::kHiveDefaultConnectorId, - { - {"orc.flatten.map", "true"}, - {"orc.map.flat.cols", "0"}, - {"orc.map.flat.cols.struct.keys", "[[\"1\", \"2\"]]"}, - }, - nullptr, - fileName, - common::CompressionKind_NONE, - outputType) - .planNode(); - auto writerResults = AssertQueryBuilder(plan).copyResults(pool()); - - // Check we get the expected map after reading. - auto expected = makeRowVector( - {"col1"}, - { - makeMapVector( - size, - [](auto /*row*/) { return 2; }, - [](auto row) { return row % 2 == 0 ? 2 : 1; }, - [](auto row) { return row / 2; }), - }); - plan = PlanBuilder().tableScan(outputType).planNode(); - AssertQueryBuilder(plan) - .split(makeHiveConnectorSplit( - targetDirectoryPath->getPath() + "/" + fileName)) - .assertResults(expected); -} - -TEST_F(BasicTableWriterTestBase, targetFileName) { - constexpr const char* kFileName = "test.dwrf"; - auto data = makeRowVector({makeFlatVector(10, folly::identity)}); - auto directory = TempDirectoryPath::create(); - auto plan = PlanBuilder() - .values({data}) - .tableWrite( - directory->getPath(), - dwio::common::FileFormat::DWRF, - {}, - nullptr, - kFileName) - .planNode(); - auto results = AssertQueryBuilder(plan).copyResults(pool()); - auto* details = results->childAt(TableWriteTraits::kFragmentChannel) - ->asUnchecked>(); - auto detail = folly::parseJson(details->valueAt(1)); - auto fileWriteInfos = detail["fileWriteInfos"]; - ASSERT_EQ(1, fileWriteInfos.size()); - ASSERT_EQ(fileWriteInfos[0]["writeFileName"].asString(), kFileName); - plan = PlanBuilder().tableScan(asRowType(data->type())).planNode(); - AssertQueryBuilder(plan) - .split(makeHiveConnectorSplit( - fmt::format("{}/{}", directory->getPath(), kFileName))) - .assertResults(data); -} - -class PartitionedTableWriterTest - : public TableWriterTestBase, - public testing::WithParamInterface { - public: - PartitionedTableWriterTest() : TableWriterTestBase(GetParam()) {} - - static std::vector getTestParams() { - std::vector testParams; - const std::vector multiDriverOptions = {false, true}; - std::vector fileFormats = {FileFormat::DWRF}; - if (hasWriterFactory(FileFormat::PARQUET)) { - fileFormats.push_back(FileFormat::PARQUET); - } - for (bool multiDrivers : multiDriverOptions) { - for (FileFormat fileFormat : fileFormats) { - for (bool scaleWriter : {false, true}) { - testParams.push_back(TestParam{ - fileFormat, - TestMode::kPartitioned, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kPartitioned, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kBucketed, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - 
TestMode::kBucketed, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kBucketed, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kPrestoNative, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kBucketed, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kPrestoNative, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - } - } - } - return testParams; - } -}; - -class UnpartitionedTableWriterTest - : public TableWriterTestBase, - public testing::WithParamInterface { - public: - UnpartitionedTableWriterTest() : TableWriterTestBase(GetParam()) {} - - static std::vector getTestParams() { - std::vector testParams; - const std::vector multiDriverOptions = {false, true}; - std::vector fileFormats = {FileFormat::DWRF}; - if (hasWriterFactory(FileFormat::PARQUET)) { - fileFormats.push_back(FileFormat::PARQUET); - } - for (bool multiDrivers : multiDriverOptions) { - for (FileFormat fileFormat : fileFormats) { - for (bool scaleWriter : {false, true}) { - testParams.push_back(TestParam{ - fileFormat, - TestMode::kUnpartitioned, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_NONE, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kUnpartitioned, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_NONE, - scaleWriter} - .value); - } - } - } - return testParams; - } -}; - -class BucketedTableOnlyWriteTest - : public TableWriterTestBase, - public testing::WithParamInterface { - public: - BucketedTableOnlyWriteTest() : TableWriterTestBase(GetParam()) {} - - static std::vector getTestParams() { - std::vector testParams; - const std::vector multiDriverOptions = {false, true}; - std::vector fileFormats = {FileFormat::DWRF}; - if (hasWriterFactory(FileFormat::PARQUET)) { - fileFormats.push_back(FileFormat::PARQUET); - } - const std::vector bucketModes = { - TestMode::kBucketed, TestMode::kOnlyBucketed}; - for (bool multiDrivers : multiDriverOptions) { - for (FileFormat fileFormat : fileFormats) { - for (auto bucketMode : bucketModes) { - testParams.push_back(TestParam{ - fileFormat, - bucketMode, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - /*scaleWriter=*/false} - .value); - testParams.push_back(TestParam{ - fileFormat, - bucketMode, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kHiveCompatible, - true, - multiDrivers, - CompressionKind_ZSTD, - /*scaleWriter=*/false} - .value); - testParams.push_back(TestParam{ - fileFormat, - bucketMode, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - /*scaleWriter=*/false} - .value); - testParams.push_back(TestParam{ - fileFormat, - bucketMode, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kHiveCompatible, - true, - multiDrivers, - CompressionKind_ZSTD, - /*scaleWriter=*/false} - .value); - testParams.push_back(TestParam{ - fileFormat, - bucketMode, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kPrestoNative, - false, - multiDrivers, - CompressionKind_ZSTD, - /*scaleWriter=*/false} - .value); - testParams.push_back(TestParam{ - 
fileFormat, - bucketMode, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kPrestoNative, - true, - multiDrivers, - CompressionKind_ZSTD, - /*scaleWriter=*/false} - .value); - testParams.push_back(TestParam{ - fileFormat, - bucketMode, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kPrestoNative, - false, - multiDrivers, - CompressionKind_ZSTD, - /*scaleWriter=*/false} - .value); - testParams.push_back(TestParam{ - fileFormat, - bucketMode, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kPrestoNative, - true, - multiDrivers, - CompressionKind_ZSTD, - /*scaleWriter=*/false} - .value); - } - } - } - return testParams; - } -}; - -class BucketSortOnlyTableWriterTest - : public TableWriterTestBase, - public testing::WithParamInterface { - public: - BucketSortOnlyTableWriterTest() : TableWriterTestBase(GetParam()) {} - - static std::vector getTestParams() { - std::vector testParams; - const std::vector multiDriverOptions = {false, true}; - std::vector fileFormats = {FileFormat::DWRF}; - if (hasWriterFactory(FileFormat::PARQUET)) { - fileFormats.push_back(FileFormat::PARQUET); - } - const std::vector bucketModes = { - TestMode::kBucketed, TestMode::kOnlyBucketed}; - for (bool multiDrivers : multiDriverOptions) { - for (FileFormat fileFormat : fileFormats) { - for (auto bucketMode : bucketModes) { - testParams.push_back(TestParam{ - fileFormat, - bucketMode, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kHiveCompatible, - true, - multiDrivers, - facebook::velox::common::CompressionKind_ZSTD, - /*scaleWriter=*/false} - .value); - testParams.push_back(TestParam{ - fileFormat, - bucketMode, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kHiveCompatible, - true, - multiDrivers, - facebook::velox::common::CompressionKind_NONE, - /*scaleWriter=*/false} - .value); - } - } - } - return testParams; - } -}; - -class PartitionedWithoutBucketTableWriterTest - : public TableWriterTestBase, - public testing::WithParamInterface { - public: - PartitionedWithoutBucketTableWriterTest() : TableWriterTestBase(GetParam()) {} - - static std::vector getTestParams() { - std::vector testParams; - const std::vector multiDriverOptions = {false, true}; - std::vector fileFormats = {FileFormat::DWRF}; - if (hasWriterFactory(FileFormat::PARQUET)) { - fileFormats.push_back(FileFormat::PARQUET); - } - for (bool multiDrivers : multiDriverOptions) { - for (FileFormat fileFormat : fileFormats) { - for (bool scaleWriter : {false, true}) { - testParams.push_back(TestParam{ - fileFormat, - TestMode::kPartitioned, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kPartitioned, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - true, - CompressionKind_ZSTD, - scaleWriter} - .value); - } - } - } - return testParams; - } -}; +#include "velox/exec/tests/utils/TableWriterTestBase.h" -class AllTableWriterTest : public TableWriterTestBase, - public testing::WithParamInterface { - public: - AllTableWriterTest() : TableWriterTestBase(GetParam()) {} +using namespace facebook::velox; +using namespace facebook::velox::exec::test; - static std::vector getTestParams() { - std::vector testParams; - const std::vector multiDriverOptions = {false, true}; - std::vector fileFormats = {FileFormat::DWRF}; - if (hasWriterFactory(FileFormat::PARQUET)) { - fileFormats.push_back(FileFormat::PARQUET); - } - for (bool 
multiDrivers : multiDriverOptions) { - for (FileFormat fileFormat : fileFormats) { - for (bool scaleWriter : {false, true}) { - testParams.push_back(TestParam{ - fileFormat, - TestMode::kUnpartitioned, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kUnpartitioned, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kPartitioned, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kPartitioned, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kBucketed, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kBucketed, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kBucketed, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kPrestoNative, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kBucketed, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kPrestoNative, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kOnlyBucketed, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kOnlyBucketed, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kHiveCompatible, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kOnlyBucketed, - CommitStrategy::kNoCommit, - HiveBucketProperty::Kind::kPrestoNative, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - testParams.push_back(TestParam{ - fileFormat, - TestMode::kOnlyBucketed, - CommitStrategy::kTaskCommit, - HiveBucketProperty::Kind::kPrestoNative, - false, - multiDrivers, - CompressionKind_ZSTD, - scaleWriter} - .value); - } - } - } - return testParams; +class TableWriterTest : public TableWriterTestBase { + protected: + void SetUp() override { + OperatorTestBase::SetUp(); + facebook::velox::connector::common::loadConnectorPlugins( + {"libvelox_hive_connector_plugin.so"}); + HiveConnectorTestBase::initFactory(); + TableWriterTestBase::initFactory(); } }; -// Runs a pipeline with read + filter + project (with substr) + write. 
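// Illustrative sketch (not a hunk from this patch): the new TableWriterTest
// fixture above loads the Hive connector through the plugin loader. For
// comparison, the explicit registration path used by the
// TableEvolutionFuzzerTest hunk earlier in this diff reads roughly as below;
// the connector::common entry points match that hunk, while the factory and
// config template arguments are assumptions.
filesystems::registerLocalFileSystem();
connector::common::registerConnectorFactory(
    std::make_shared<connector::hive::HiveConnectorFactory>());
auto hiveConnector =
    connector::common::getConnectorFactory(
        connector::hive::HiveConnectorFactory::kHiveConnectorName)
        ->newConnector(
            TableEvolutionFuzzer::connectorId(),
            std::make_shared<config::ConfigBase>(
                std::unordered_map<std::string, std::string>()),
            ioExecutor);
connector::common::registerConnector(hiveConnector);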
-TEST_P(AllTableWriterTest, scanFilterProjectWrite) { - auto filePaths = makeFilePaths(5); - auto vectors = makeVectors(filePaths.size(), 500); - for (int i = 0; i < filePaths.size(); i++) { - writeToFile(filePaths[i]->getPath(), vectors[i]); - } - - createDuckDbTable(vectors); - - auto outputDirectory = TempDirectoryPath::create(); - - auto planBuilder = PlanBuilder(); - auto project = planBuilder.tableScan(rowType_).filter("c2 <> 0").project( - {"c0", "c1", "c3", "c5", "c2 + c3", "substr(c5, 1, 1)"}); - - auto intputTypes = project.planNode()->outputType()->children(); - std::vector tableColumnNames = { - "c0", "c1", "c3", "c5", "c2_plus_c3", "substr_c5"}; - const auto outputType = - ROW(std::move(tableColumnNames), std::move(intputTypes)); - - auto plan = createInsertPlan( - project, - outputType, - outputDirectory->getPath(), - partitionedBy_, - bucketProperty_, - compressionKind_, - getNumWriters(), - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - - assertQueryWithWriterConfigs( - plan, filePaths, "SELECT count(*) FROM tmp WHERE c2 <> 0"); - - // To test the correctness of the generated output, - // We create a new plan that only read that file and then - // compare that against a duckDB query that runs the whole query. - if (partitionedBy_.size() > 0) { - auto newOutputType = getNonPartitionsColumns(partitionedBy_, outputType); - assertQuery( - PlanBuilder().tableScan(newOutputType).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT c3, c5, c2 + c3, substr(c5, 1, 1) FROM tmp WHERE c2 <> 0"); - verifyTableWriterOutput(outputDirectory->getPath(), newOutputType, false); - } else { - assertQuery( - PlanBuilder().tableScan(outputType).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT c0, c1, c3, c5, c2 + c3, substr(c5, 1, 1) FROM tmp WHERE c2 <> 0"); - verifyTableWriterOutput(outputDirectory->getPath(), outputType, false); - } -} - -TEST_P(AllTableWriterTest, renameAndReorderColumns) { - auto filePaths = makeFilePaths(5); - auto vectors = makeVectors(filePaths.size(), 500); - for (int i = 0; i < filePaths.size(); ++i) { - writeToFile(filePaths[i]->getPath(), vectors[i]); - } - - createDuckDbTable(vectors); - - auto outputDirectory = TempDirectoryPath::create(); - - if (testMode_ == TestMode::kPartitioned || testMode_ == TestMode::kBucketed) { - const std::vector partitionBy = {"x", "y"}; - setPartitionBy(partitionBy); - } - if (testMode_ == TestMode::kBucketed || - testMode_ == TestMode::kOnlyBucketed) { - setBucketProperty( - bucketProperty_->kind(), - bucketProperty_->bucketCount(), - {"z", "v"}, - {REAL(), VARCHAR()}, - {}); - } - - auto inputRowType = - ROW({"c2", "c5", "c4", "c1", "c0", "c3"}, - {SMALLINT(), VARCHAR(), DOUBLE(), INTEGER(), BIGINT(), REAL()}); - - setTableSchema( - ROW({"u", "v", "w", "x", "y", "z"}, - {SMALLINT(), VARCHAR(), DOUBLE(), INTEGER(), BIGINT(), REAL()})); - - auto plan = createInsertPlan( - PlanBuilder().tableScan(rowType_), - inputRowType, - tableSchema_, - outputDirectory->getPath(), - partitionedBy_, - bucketProperty_, - compressionKind_, - getNumWriters(), - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - - assertQueryWithWriterConfigs(plan, filePaths, "SELECT count(*) FROM tmp"); - - if (partitionedBy_.size() > 0) { - auto newOutputType = getNonPartitionsColumns(partitionedBy_, tableSchema_); - HiveConnectorTestBase::assertQuery( - PlanBuilder().tableScan(newOutputType).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT c2, c5, c4, c3 FROM tmp"); - - 
verifyTableWriterOutput(outputDirectory->getPath(), newOutputType, false); - } else { - HiveConnectorTestBase::assertQuery( - PlanBuilder().tableScan(tableSchema_).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT c2, c5, c4, c1, c0, c3 FROM tmp"); - - verifyTableWriterOutput(outputDirectory->getPath(), tableSchema_, false); - } -} - -// Runs a pipeline with read + write. -TEST_P(AllTableWriterTest, directReadWrite) { - auto filePaths = makeFilePaths(5); - auto vectors = makeVectors(filePaths.size(), 200); - for (int i = 0; i < filePaths.size(); i++) { - writeToFile(filePaths[i]->getPath(), vectors[i]); - } - - createDuckDbTable(vectors); - - auto outputDirectory = TempDirectoryPath::create(); - auto plan = createInsertPlan( - PlanBuilder().tableScan(rowType_), - rowType_, - outputDirectory->getPath(), - partitionedBy_, - bucketProperty_, - compressionKind_, - getNumWriters(), - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - - assertQuery(plan, filePaths, "SELECT count(*) FROM tmp"); - - // To test the correctness of the generated output, - // We create a new plan that only read that file and then - // compare that against a duckDB query that runs the whole query. - - if (partitionedBy_.size() > 0) { - auto newOutputType = getNonPartitionsColumns(partitionedBy_, tableSchema_); - assertQuery( - PlanBuilder().tableScan(newOutputType).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT c2, c3, c4, c5 FROM tmp"); - rowType_ = newOutputType; - verifyTableWriterOutput(outputDirectory->getPath(), rowType_); - } else { - assertQuery( - PlanBuilder().tableScan(rowType_).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT * FROM tmp"); - - verifyTableWriterOutput(outputDirectory->getPath(), rowType_); - } -} - -// Tests writing constant vectors. -TEST_P(AllTableWriterTest, constantVectors) { - vector_size_t size = 1'000; - - // Make constant vectors of various types with null and non-null values. 
- auto vector = makeConstantVector(size); - - createDuckDbTable({vector}); - - auto outputDirectory = TempDirectoryPath::create(); - auto op = createInsertPlan( - PlanBuilder().values({vector}), - rowType_, - outputDirectory->getPath(), - partitionedBy_, - bucketProperty_, - compressionKind_, - getNumWriters(), - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - - assertQuery(op, fmt::format("SELECT {}", size)); - - if (partitionedBy_.size() > 0) { - auto newOutputType = getNonPartitionsColumns(partitionedBy_, tableSchema_); - assertQuery( - PlanBuilder().tableScan(newOutputType).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT c2, c3, c4, c5 FROM tmp"); - rowType_ = newOutputType; - verifyTableWriterOutput(outputDirectory->getPath(), rowType_); - } else { - assertQuery( - PlanBuilder().tableScan(rowType_).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT * FROM tmp"); - - verifyTableWriterOutput(outputDirectory->getPath(), rowType_); - } -} - -TEST_P(AllTableWriterTest, emptyInput) { - auto outputDirectory = TempDirectoryPath::create(); - auto vector = makeConstantVector(0); - auto op = createInsertPlan( - PlanBuilder().values({vector}), - rowType_, - outputDirectory->getPath(), - partitionedBy_, - bucketProperty_, - compressionKind_, - getNumWriters(), - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - - assertQuery(op, "SELECT 0"); -} - -TEST_P(AllTableWriterTest, commitStrategies) { - auto filePaths = makeFilePaths(5); - auto vectors = makeVectors(filePaths.size(), 100); - - createDuckDbTable(vectors); - - // Test the kTaskCommit commit strategy writing to one dot-prefixed - // temporary file. - { - SCOPED_TRACE(CommitStrategy::kTaskCommit); - auto outputDirectory = TempDirectoryPath::create(); - auto plan = createInsertPlan( - PlanBuilder().values(vectors), - rowType_, - outputDirectory->getPath(), - partitionedBy_, - bucketProperty_, - compressionKind_, - getNumWriters(), - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - - assertQuery(plan, "SELECT count(*) FROM tmp"); - - if (partitionedBy_.size() > 0) { - auto newOutputType = - getNonPartitionsColumns(partitionedBy_, tableSchema_); - assertQuery( - PlanBuilder().tableScan(newOutputType).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT c2, c3, c4, c5 FROM tmp"); - auto originalRowType = rowType_; - rowType_ = newOutputType; - verifyTableWriterOutput(outputDirectory->getPath(), rowType_); - rowType_ = originalRowType; - } else { - assertQuery( - PlanBuilder().tableScan(rowType_).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT * FROM tmp"); - verifyTableWriterOutput(outputDirectory->getPath(), rowType_); - } - } - // Test kNoCommit commit strategy writing to non-temporary files. 
- { - SCOPED_TRACE(CommitStrategy::kNoCommit); - auto outputDirectory = TempDirectoryPath::create(); - setCommitStrategy(CommitStrategy::kNoCommit); - auto plan = createInsertPlan( - PlanBuilder().values(vectors), - rowType_, - outputDirectory->getPath(), - partitionedBy_, - bucketProperty_, - compressionKind_, - getNumWriters(), - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - - assertQuery(plan, "SELECT count(*) FROM tmp"); - - if (partitionedBy_.size() > 0) { - auto newOutputType = - getNonPartitionsColumns(partitionedBy_, tableSchema_); - assertQuery( - PlanBuilder().tableScan(newOutputType).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT c2, c3, c4, c5 FROM tmp"); - rowType_ = newOutputType; - verifyTableWriterOutput(outputDirectory->getPath(), rowType_); - } else { - assertQuery( - PlanBuilder().tableScan(rowType_).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT * FROM tmp"); - verifyTableWriterOutput(outputDirectory->getPath(), rowType_); - } - } -} - -TEST_P(PartitionedTableWriterTest, specialPartitionName) { - const int32_t numPartitions = 50; - const int32_t numBatches = 2; - - const auto rowType = - ROW({"c0", "p0", "p1", "c1", "c3", "c5"}, - {INTEGER(), INTEGER(), VARCHAR(), BIGINT(), REAL(), VARCHAR()}); - const std::vector partitionKeys = {"p0", "p1"}; - const std::vector partitionTypes = {INTEGER(), VARCHAR()}; - - const std::vector charsToEscape = { - '"', - '#', - '%', - '\'', - '*', - '/', - ':', - '=', - '?', - '\\', - '\x7F', - '{', - '[', - ']', - '^'}; - ASSERT_GE(numPartitions, charsToEscape.size()); - std::vector vectors = makeBatches(numBatches, [&](auto) { - return makeRowVector( - rowType->names(), - { - makeFlatVector( - numPartitions, [&](auto row) { return row + 100; }), - makeFlatVector( - numPartitions, [&](auto row) { return row; }), - makeFlatVector( - numPartitions, - [&](auto row) { - // special character - return StringView::makeInline( - fmt::format("str_{}{}", row, charsToEscape.at(row % 15))); - }), - makeFlatVector( - numPartitions, [&](auto row) { return row + 1000; }), - makeFlatVector( - numPartitions, [&](auto row) { return row + 33.23; }), - makeFlatVector( - numPartitions, - [&](auto row) { - return StringView::makeInline( - fmt::format("bucket_{}", row * 3)); - }), - }); - }); - createDuckDbTable(vectors); - - auto inputFilePaths = makeFilePaths(numBatches); - for (int i = 0; i < numBatches; i++) { - writeToFile(inputFilePaths[i]->getPath(), vectors[i]); - } - - auto outputDirectory = TempDirectoryPath::create(); - auto plan = createInsertPlan( - PlanBuilder().tableScan(rowType), - rowType, - outputDirectory->getPath(), - partitionKeys, - bucketProperty_, - compressionKind_, - getNumWriters(), - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - - auto task = assertQuery(plan, inputFilePaths, "SELECT count(*) FROM tmp"); - - std::set actualPartitionDirectories = - getLeafSubdirectories(outputDirectory->getPath()); - - std::set expectedPartitionDirectories; - const std::vector expectedCharsAfterEscape = { - "%22", - "%23", - "%25", - "%27", - "%2A", - "%2F", - "%3A", - "%3D", - "%3F", - "%5C", - "%7F", - "%7B", - "%5B", - "%5D", - "%5E"}; - for (auto i = 0; i < numPartitions; ++i) { - // url encoded - auto partitionName = fmt::format( - "p0={}/p1=str_{}{}", i, i, expectedCharsAfterEscape.at(i % 15)); - expectedPartitionDirectories.emplace( - fs::path(outputDirectory->getPath()) / partitionName); - } - EXPECT_EQ(actualPartitionDirectories, 
expectedPartitionDirectories); -} - -TEST_P(PartitionedTableWriterTest, multiplePartitions) { - int32_t numPartitions = 50; - int32_t numBatches = 2; - - auto rowType = - ROW({"c0", "p0", "p1", "c1", "c3", "c5"}, - {INTEGER(), INTEGER(), VARCHAR(), BIGINT(), REAL(), VARCHAR()}); - std::vector partitionKeys = {"p0", "p1"}; - std::vector partitionTypes = {INTEGER(), VARCHAR()}; - - std::vector vectors = makeBatches(numBatches, [&](auto) { - return makeRowVector( - rowType->names(), - { - makeFlatVector( - numPartitions, [&](auto row) { return row + 100; }), - makeFlatVector( - numPartitions, [&](auto row) { return row; }), - makeFlatVector( - numPartitions, - [&](auto row) { - return StringView::makeInline(fmt::format("str_{}", row)); - }), - makeFlatVector( - numPartitions, [&](auto row) { return row + 1000; }), - makeFlatVector( - numPartitions, [&](auto row) { return row + 33.23; }), - makeFlatVector( - numPartitions, - [&](auto row) { - return StringView::makeInline( - fmt::format("bucket_{}", row * 3)); - }), - }); - }); - createDuckDbTable(vectors); - - auto inputFilePaths = makeFilePaths(numBatches); - for (int i = 0; i < numBatches; i++) { - writeToFile(inputFilePaths[i]->getPath(), vectors[i]); - } - - auto outputDirectory = TempDirectoryPath::create(); - auto plan = createInsertPlan( - PlanBuilder().tableScan(rowType), - rowType, - outputDirectory->getPath(), - partitionKeys, - bucketProperty_, - compressionKind_, - getNumWriters(), - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - - auto task = assertQuery(plan, inputFilePaths, "SELECT count(*) FROM tmp"); - - // Verify that there is one partition directory for each partition. - std::set actualPartitionDirectories = - getLeafSubdirectories(outputDirectory->getPath()); - - std::set expectedPartitionDirectories; - std::set partitionNames; - for (auto i = 0; i < numPartitions; i++) { - auto partitionName = fmt::format("p0={}/p1=str_{}", i, i); - partitionNames.emplace(partitionName); - expectedPartitionDirectories.emplace( - fs::path(outputDirectory->getPath()) / partitionName); - } - EXPECT_EQ(actualPartitionDirectories, expectedPartitionDirectories); - - // Verify distribution of records in partition directories. - auto iterPartitionDirectory = actualPartitionDirectories.begin(); - auto iterPartitionName = partitionNames.begin(); - auto newOutputType = getNonPartitionsColumns(partitionKeys, rowType); - while (iterPartitionDirectory != actualPartitionDirectories.end()) { - assertQuery( - PlanBuilder().tableScan(newOutputType).planNode(), - makeHiveConnectorSplits(*iterPartitionDirectory), - fmt::format( - "SELECT c0, c1, c3, c5 FROM tmp WHERE {}", - partitionNameToPredicate(*iterPartitionName, partitionTypes))); - // In case of unbucketed partitioned table, one single file is written to - // each partition directory for Hive connector. - if (testMode_ == TestMode::kPartitioned) { - ASSERT_EQ(countRecursiveFiles(*iterPartitionDirectory), 1); - } else { - ASSERT_GE(countRecursiveFiles(*iterPartitionDirectory), 1); - } - - ++iterPartitionDirectory; - ++iterPartitionName; - } -} - -TEST_P(PartitionedTableWriterTest, singlePartition) { - const int32_t numBatches = 2; - auto rowType = - ROW({"c0", "p0", "c3", "c5"}, {VARCHAR(), BIGINT(), REAL(), VARCHAR()}); - std::vector partitionKeys = {"p0"}; - - // Partition vector is constant vector. 
- std::vector vectors = makeBatches(numBatches, [&](auto) { - return makeRowVector( - rowType->names(), - {makeFlatVector( - 1'000, - [&](auto row) { - return StringView::makeInline(fmt::format("str_{}", row)); - }), - makeConstant((int64_t)365, 1'000), - makeFlatVector(1'000, [&](auto row) { return row + 33.23; }), - makeFlatVector(1'000, [&](auto row) { - return StringView::makeInline(fmt::format("bucket_{}", row * 3)); - })}); - }); - createDuckDbTable(vectors); - - auto inputFilePaths = makeFilePaths(numBatches); - for (int i = 0; i < numBatches; i++) { - writeToFile(inputFilePaths[i]->getPath(), vectors[i]); - } - - auto outputDirectory = TempDirectoryPath::create(); - const int numWriters = getNumWriters(); - auto plan = createInsertPlan( - PlanBuilder().tableScan(rowType), - rowType, - outputDirectory->getPath(), - partitionKeys, - bucketProperty_, - compressionKind_, - numWriters, - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - - auto task = assertQueryWithWriterConfigs( - plan, inputFilePaths, "SELECT count(*) FROM tmp"); - - std::set partitionDirectories = - getLeafSubdirectories(outputDirectory->getPath()); - - // Verify only a single partition directory is created. - ASSERT_EQ(partitionDirectories.size(), 1); - EXPECT_EQ( - *partitionDirectories.begin(), - fs::path(outputDirectory->getPath()) / "p0=365"); - - // Verify all data is written to the single partition directory. - auto newOutputType = getNonPartitionsColumns(partitionKeys, rowType); - assertQuery( - PlanBuilder().tableScan(newOutputType).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT c0, c3, c5 FROM tmp"); - - // In case of unbucketed partitioned table, one single file is written to - // each partition directory for Hive connector. - if (testMode_ == TestMode::kPartitioned) { - ASSERT_LE(countRecursiveFiles(*partitionDirectories.begin()), numWriters); - } else { - ASSERT_GE(countRecursiveFiles(*partitionDirectories.begin()), numWriters); - } -} - -TEST_P(PartitionedWithoutBucketTableWriterTest, fromSinglePartitionToMultiple) { - auto rowType = ROW({"c0", "c1"}, {BIGINT(), BIGINT()}); - setDataTypes(rowType); - std::vector partitionKeys = {"c0"}; - - // Partition vector is constant vector. - std::vector vectors; - // The initial vector has the same partition key value; - vectors.push_back(makeRowVector( - rowType->names(), - {makeFlatVector(1'000, [&](auto /*unused*/) { return 1; }), - makeFlatVector(1'000, [&](auto row) { return row + 1; })})); - // The second vector has different partition key value. 
-  vectors.push_back(makeRowVector(
-      rowType->names(),
-      {makeFlatVector<int64_t>(1'000, [&](auto row) { return row * 234 % 30; }),
-       makeFlatVector<int64_t>(1'000, [&](auto row) { return row + 1; })}));
-  createDuckDbTable(vectors);
-
-  auto outputDirectory = TempDirectoryPath::create();
-  auto plan = createInsertPlan(
-      PlanBuilder().values(vectors),
-      rowType,
-      outputDirectory->getPath(),
-      partitionKeys,
+TEST_F(TableWriterTest, basicWriteWithBucketing) {
+  auto rowType = ROW({"a", "b"}, {BIGINT(), VARCHAR()});
+  std::vector<std::string> partitionedBy = {"a"};
+  auto bucketProp = std::make_shared(
+      partitionedBy,
+      std::vector<TypePtr>{BIGINT()},
+      std::vector>{});
+  auto location = std::make_shared<connector::common::LocationHandle>(
+      "/tmp/out", "/tmp/out", connector::common::LocationHandle::TableType::kNew);
+  auto insertHandle = TableWriterTestBase::makeInsertTableHandle(
+      {"a", "b"},
+      {BIGINT(), VARCHAR()},
+      partitionedBy,
+      bucketProp,
+      location,
+      dwio::common::FileFormat::PARQUET,
+      std::nullopt,
+      {},
       nullptr,
-      compressionKind_,
-      numTableWriterCount_);
-
-  assertQueryWithWriterConfigs(plan, "SELECT count(*) FROM tmp");
-
-  auto newOutputType = getNonPartitionsColumns(partitionKeys, rowType);
-  assertQuery(
-      PlanBuilder().tableScan(newOutputType).planNode(),
-      makeHiveConnectorSplits(outputDirectory),
-      "SELECT c1 FROM tmp");
-}
-
-TEST_P(PartitionedTableWriterTest, maxPartitions) {
-  SCOPED_TRACE(testParam_.toString());
-  const int32_t maxPartitions = 100;
-  const int32_t numPartitions =
-      testMode_ == TestMode::kBucketed ? 1 : maxPartitions + 1;
-  if (testMode_ == TestMode::kBucketed) {
-    setBucketProperty(
-        testParam_.bucketKind(),
-        1000,
-        bucketProperty_->bucketedBy(),
-        bucketProperty_->bucketedTypes(),
-        bucketProperty_->sortedBy());
-  }
-
-  auto rowType = ROW({"p0", "c3", "c5"}, {BIGINT(), REAL(), VARCHAR()});
-  std::vector<std::string> partitionKeys = {"p0"};
-
-  RowVectorPtr vector;
-  if (testMode_ == TestMode::kPartitioned) {
-    vector = makeRowVector(
-        rowType->names(),
-        {makeFlatVector<int64_t>(numPartitions, [&](auto row) { return row; }),
-         makeFlatVector<float>(
-             numPartitions, [&](auto row) { return row + 33.23; }),
-         makeFlatVector<StringView>(numPartitions, [&](auto row) {
-           return StringView::makeInline(fmt::format("bucket_{}", row * 3));
-         })});
-  } else {
-    vector = makeRowVector(
-        rowType->names(),
-        {makeFlatVector<int64_t>(4'000, [&](auto /*unused*/) { return 0; }),
-         makeFlatVector<float>(4'000, [&](auto row) { return row + 33.23; }),
-         makeFlatVector<StringView>(4'000, [&](auto row) {
-           return StringView::makeInline(fmt::format("bucket_{}", row * 3));
-         })});
-  };
-
-  auto outputDirectory = TempDirectoryPath::create();
-  auto plan = createInsertPlan(
-      PlanBuilder().values({vector}),
-      rowType,
-      outputDirectory->getPath(),
-      partitionKeys,
-      bucketProperty_,
-      compressionKind_,
-      getNumWriters(),
-      connector::hive::LocationHandle::TableType::kNew,
-      commitStrategy_);
-
-  if (testMode_ == TestMode::kPartitioned) {
-    VELOX_ASSERT_THROW(
-        AssertQueryBuilder(plan)
-            .connectorSessionProperty(
-                kHiveConnectorId,
-                HiveConfig::kMaxPartitionsPerWritersSession,
-                folly::to<std::string>(maxPartitions))
-            .copyResults(pool()),
-        fmt::format(
-            "Exceeded limit of {} distinct partitions.", maxPartitions));
-  } else {
-    VELOX_ASSERT_THROW(
-        AssertQueryBuilder(plan)
-            .connectorSessionProperty(
-                kHiveConnectorId,
-                HiveConfig::kMaxPartitionsPerWritersSession,
-                folly::to<std::string>(maxPartitions))
-            .copyResults(pool()),
-        "Exceeded open writer limit");
-  }
-}
-
-// Test TableWriter does not create a file if input is empty.
-TEST_P(AllTableWriterTest, writeNoFile) { - auto outputDirectory = TempDirectoryPath::create(); - auto plan = createInsertPlan( - PlanBuilder().tableScan(rowType_).filter("false"), - rowType_, - outputDirectory->getPath()); - - auto execute = [&](const std::shared_ptr& plan, - std::shared_ptr queryCtx) { - CursorParameters params; - params.planNode = plan; - params.queryCtx = queryCtx; - readCursor(params, [&](TaskCursor* taskCursor) { - if (taskCursor->noMoreSplits()) { - return; - } - taskCursor->task()->noMoreSplits("0"); - taskCursor->setNoMoreSplits(); - }); - }; - - execute(plan, core::QueryCtx::create(executor_.get())); - ASSERT_TRUE(fs::is_empty(outputDirectory->getPath())); -} - -TEST_P(UnpartitionedTableWriterTest, differentCompression) { - std::vector compressions{ - CompressionKind_NONE, - CompressionKind_ZLIB, - CompressionKind_SNAPPY, - CompressionKind_LZO, - CompressionKind_ZSTD, - CompressionKind_LZ4, - CompressionKind_GZIP, - CompressionKind_MAX}; - - for (auto compressionKind : compressions) { - auto input = makeVectors(10, 10); - auto outputDirectory = TempDirectoryPath::create(); - if (compressionKind == CompressionKind_MAX) { - VELOX_ASSERT_THROW( - createInsertPlan( - PlanBuilder().values(input), - rowType_, - outputDirectory->getPath(), - {}, - nullptr, - compressionKind, - numTableWriterCount_, - connector::hive::LocationHandle::TableType::kNew), - "Unsupported compression type: CompressionKind_MAX"); - return; - } - auto plan = createInsertPlan( - PlanBuilder().values(input), - rowType_, - outputDirectory->getPath(), - {}, - nullptr, - compressionKind, - numTableWriterCount_, - connector::hive::LocationHandle::TableType::kNew); - - // currently we don't support any compression in PARQUET format - if (fileFormat_ == FileFormat::PARQUET && - compressionKind != CompressionKind_NONE) { - continue; - } - if (compressionKind == CompressionKind_NONE || - compressionKind == CompressionKind_ZLIB || - compressionKind == CompressionKind_ZSTD) { - auto result = AssertQueryBuilder(plan) - .config( - QueryConfig::kTaskWriterCount, - std::to_string(numTableWriterCount_)) - .copyResults(pool()); - assertEqualResults( - {makeRowVector({makeConstant(100, 1)})}, {result}); - } else { - VELOX_ASSERT_THROW( - AssertQueryBuilder(plan) - .config( - QueryConfig::kTaskWriterCount, - std::to_string(numTableWriterCount_)) - .copyResults(pool()), - "Unsupported compression type:"); - } - } -} - -TEST_P(UnpartitionedTableWriterTest, runtimeStatsCheck) { - // The runtime stats test only applies for dwrf file format. 
- if (fileFormat_ != dwio::common::FileFormat::DWRF) { - return; - } - struct { - int numInputVectors; - std::string maxStripeSize; - int expectedNumStripes; - - std::string debugString() const { - return fmt::format( - "numInputVectors: {}, maxStripeSize: {}, expectedNumStripes: {}", - numInputVectors, - maxStripeSize, - expectedNumStripes); - } - } testSettings[] = { - {10, "1GB", 1}, - {1, "1GB", 1}, - {2, "1GB", 1}, - {10, "1B", 10}, - {2, "1B", 2}, - {1, "1B", 1}}; - - for (const auto& testData : testSettings) { - SCOPED_TRACE(testData.debugString()); - auto rowType = ROW({"c0", "c1"}, {VARCHAR(), BIGINT()}); - - VectorFuzzer::Options options; - options.nullRatio = 0.0; - options.vectorSize = 1; - options.stringLength = 1L << 20; - VectorFuzzer fuzzer(options, pool()); - - std::vector vectors; - for (int i = 0; i < testData.numInputVectors; ++i) { - vectors.push_back(fuzzer.fuzzInputRow(rowType)); - } - - createDuckDbTable(vectors); - - auto outputDirectory = TempDirectoryPath::create(); - auto plan = createInsertPlan( - PlanBuilder().values(vectors), - rowType, - outputDirectory->getPath(), - {}, - nullptr, - compressionKind_, - 1, - connector::hive::LocationHandle::TableType::kNew); - const std::shared_ptr task = - AssertQueryBuilder(plan, duckDbQueryRunner_) - .config(QueryConfig::kTaskWriterCount, std::to_string(1)) - .connectorSessionProperty( - kHiveConnectorId, - dwrf::Config::kOrcWriterMaxStripeSizeSession, - testData.maxStripeSize) - .assertResults("SELECT count(*) FROM tmp"); - auto stats = task->taskStats().pipelineStats.front().operatorStats; - if (testData.maxStripeSize == "1GB") { - ASSERT_GT( - stats[1].memoryStats.peakTotalMemoryReservation, - testData.numInputVectors * options.stringLength); - } - ASSERT_EQ( - stats[1].runtimeStats["stripeSize"].count, testData.expectedNumStripes); - ASSERT_EQ(stats[1].runtimeStats[TableWriter::kNumWrittenFiles].sum, 1); - ASSERT_EQ(stats[1].runtimeStats[TableWriter::kNumWrittenFiles].count, 1); - ASSERT_GE(stats[1].runtimeStats[TableWriter::kWriteIOTime].sum, 0); - ASSERT_EQ(stats[1].runtimeStats[TableWriter::kWriteIOTime].count, 1); - } -} - -TEST_P(UnpartitionedTableWriterTest, immutableSettings) { - struct { - connector::hive::LocationHandle::TableType dataType; - bool immutablePartitionsEnabled; - bool expectedInsertSuccees; - - std::string debugString() const { - return fmt::format( - "dataType:{}, immutablePartitionsEnabled:{}, operationSuccess:{}", - dataType, - immutablePartitionsEnabled, - expectedInsertSuccees); - } - } testSettings[] = { - {connector::hive::LocationHandle::TableType::kNew, true, true}, - {connector::hive::LocationHandle::TableType::kNew, false, true}, - {connector::hive::LocationHandle::TableType::kExisting, true, false}, - {connector::hive::LocationHandle::TableType::kExisting, false, true}}; - - for (auto testData : testSettings) { - SCOPED_TRACE(testData.debugString()); - std::unordered_map propFromFile{ - {"hive.immutable-partitions", - testData.immutablePartitionsEnabled ? 
"true" : "false"}}; - std::shared_ptr config{ - std::make_shared(std::move(propFromFile))}; - resetHiveConnector(config); - - auto input = makeVectors(10, 10); - auto outputDirectory = TempDirectoryPath::create(); - auto plan = createInsertPlan( - PlanBuilder().values(input), - rowType_, - outputDirectory->getPath(), - {}, - nullptr, - CompressionKind_NONE, - numTableWriterCount_, - testData.dataType); - - if (!testData.expectedInsertSuccees) { - VELOX_ASSERT_THROW( - AssertQueryBuilder(plan).copyResults(pool()), - "Unpartitioned Hive tables are immutable."); - } else { - auto result = AssertQueryBuilder(plan) - .config( - QueryConfig::kTaskWriterCount, - std::to_string(numTableWriterCount_)) - .copyResults(pool()); - assertEqualResults( - {makeRowVector({makeConstant(100, 1)})}, {result}); - } - } -} - -TEST_P(BucketedTableOnlyWriteTest, bucketCountLimit) { - SCOPED_TRACE(testParam_.toString()); - auto input = makeVectors(1, 100); - createDuckDbTable(input); - struct { - uint32_t bucketCount; - bool expectedError; - - std::string debugString() const { - return fmt::format( - "bucketCount:{} expectedError:{}", bucketCount, expectedError); - } - } testSettings[] = { - {1, false}, - {3, false}, - {HiveDataSink::maxBucketCount() - 1, false}, - {HiveDataSink::maxBucketCount(), true}, - {HiveDataSink::maxBucketCount() + 1, true}, - {HiveDataSink::maxBucketCount() * 2, true}}; - for (const auto& testData : testSettings) { - SCOPED_TRACE(testData.debugString()); - auto outputDirectory = TempDirectoryPath::create(); - setBucketProperty( - bucketProperty_->kind(), - testData.bucketCount, - bucketProperty_->bucketedBy(), - bucketProperty_->bucketedTypes(), - bucketProperty_->sortedBy()); - auto plan = createInsertPlan( - PlanBuilder().values({input}), - rowType_, - outputDirectory->getPath(), - partitionedBy_, - bucketProperty_, - compressionKind_, - getNumWriters(), - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - if (testData.expectedError) { - VELOX_ASSERT_THROW( - AssertQueryBuilder(plan) - .connectorSessionProperty( - kHiveConnectorId, - HiveConfig::kMaxPartitionsPerWritersSession, - // Make sure we have a sufficient large writer limit. 
- folly::to(testData.bucketCount * 2)) - .copyResults(pool()), - "bucketCount exceeds the limit"); - } else { - assertQueryWithWriterConfigs(plan, "SELECT count(*) FROM tmp"); - - if (partitionedBy_.size() > 0) { - auto newOutputType = - getNonPartitionsColumns(partitionedBy_, tableSchema_); - assertQuery( - PlanBuilder().tableScan(newOutputType).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT c2, c3, c4, c5 FROM tmp"); - auto originalRowType = rowType_; - rowType_ = newOutputType; - verifyTableWriterOutput(outputDirectory->getPath(), rowType_); - rowType_ = originalRowType; - } else { - assertQuery( - PlanBuilder().tableScan(rowType_).planNode(), - makeHiveConnectorSplits(outputDirectory), - "SELECT * FROM tmp"); - verifyTableWriterOutput(outputDirectory->getPath(), rowType_); - } - } - } -} - -TEST_P(BucketedTableOnlyWriteTest, mismatchedBucketTypes) { - SCOPED_TRACE(testParam_.toString()); - auto input = makeVectors(1, 100); - createDuckDbTable(input); - auto outputDirectory = TempDirectoryPath::create(); - std::vector badBucketedBy = bucketProperty_->bucketedTypes(); - const auto oldType = badBucketedBy[0]; - badBucketedBy[0] = VARCHAR(); - setBucketProperty( - bucketProperty_->kind(), - bucketProperty_->bucketCount(), - bucketProperty_->bucketedBy(), - badBucketedBy, - bucketProperty_->sortedBy()); - auto plan = createInsertPlan( - PlanBuilder().values({input}), - rowType_, - outputDirectory->getPath(), - partitionedBy_, - bucketProperty_, - compressionKind_, - getNumWriters(), - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - VELOX_ASSERT_THROW( - AssertQueryBuilder(plan).copyResults(pool()), - fmt::format( - "Input column {} type {} doesn't match bucket type {}", - bucketProperty_->bucketedBy()[0], - oldType->toString(), - bucketProperty_->bucketedTypes()[0])); -} - -TEST_P(AllTableWriterTest, tableWriteOutputCheck) { - SCOPED_TRACE(testParam_.toString()); - if (!testParam_.multiDrivers() || - testParam_.testMode() != TestMode::kUnpartitioned) { - return; - } - auto input = makeVectors(10, 100); - createDuckDbTable(input); - auto outputDirectory = TempDirectoryPath::create(); - auto plan = createInsertPlan( - PlanBuilder().values({input}), - rowType_, - outputDirectory->getPath(), - partitionedBy_, - bucketProperty_, - compressionKind_, - getNumWriters(), - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_, - false); - - auto result = runQueryWithWriterConfigs(plan); - auto writtenRowVector = result->childAt(TableWriteTraits::kRowCountChannel) - ->asFlatVector(); - auto fragmentVector = result->childAt(TableWriteTraits::kFragmentChannel) - ->asFlatVector(); - auto commitContextVector = result->childAt(TableWriteTraits::kContextChannel) - ->asFlatVector(); - const int64_t expectedRows = 10 * 100; - std::vector writeFiles; - int64_t numRows{0}; - for (int i = 0; i < result->size(); ++i) { - if (testParam_.multiDrivers()) { - ASSERT_FALSE(commitContextVector->isNullAt(i)); - if (!fragmentVector->isNullAt(i)) { - ASSERT_TRUE(writtenRowVector->isNullAt(i)); - } - } else { - if (i == 0) { - ASSERT_TRUE(fragmentVector->isNullAt(i)); - } else { - ASSERT_TRUE(writtenRowVector->isNullAt(i)); - ASSERT_FALSE(fragmentVector->isNullAt(i)); - } - ASSERT_FALSE(commitContextVector->isNullAt(i)); - } - if (!fragmentVector->isNullAt(i)) { - ASSERT_FALSE(fragmentVector->isNullAt(i)); - folly::dynamic obj = folly::parseJson(fragmentVector->valueAt(i)); - if (testMode_ == TestMode::kUnpartitioned) { - ASSERT_EQ(obj["targetPath"], 
outputDirectory->getPath()); - ASSERT_EQ(obj["writePath"], outputDirectory->getPath()); - } else { - std::string partitionDirRe; - for (const auto& partitionBy : partitionedBy_) { - partitionDirRe += fmt::format("/{}=.+", partitionBy); - } - ASSERT_TRUE(RE2::FullMatch( - obj["targetPath"].asString(), - fmt::format("{}{}", outputDirectory->getPath(), partitionDirRe))) - << obj["targetPath"].asString(); - ASSERT_TRUE(RE2::FullMatch( - obj["writePath"].asString(), - fmt::format("{}{}", outputDirectory->getPath(), partitionDirRe))) - << obj["writePath"].asString(); - } - numRows += obj["rowCount"].asInt(); - ASSERT_EQ(obj["updateMode"].asString(), "NEW"); - - ASSERT_TRUE(obj["fileWriteInfos"].isArray()); - ASSERT_EQ(obj["fileWriteInfos"].size(), 1); - folly::dynamic writerInfoObj = obj["fileWriteInfos"][0]; - const std::string writeFileName = - writerInfoObj["writeFileName"].asString(); - writeFiles.push_back(writeFileName); - const std::string targetFileName = - writerInfoObj["targetFileName"].asString(); - const std::string writeFileFullPath = - obj["writePath"].asString() + "/" + writeFileName; - std::filesystem::path path{writeFileFullPath}; - const auto actualFileSize = fs::file_size(path); - ASSERT_EQ(obj["onDiskDataSizeInBytes"].asInt(), actualFileSize); - ASSERT_GT(obj["inMemoryDataSizeInBytes"].asInt(), 0); - ASSERT_EQ(writerInfoObj["fileSize"], actualFileSize); - if (commitStrategy_ == CommitStrategy::kNoCommit) { - ASSERT_EQ(writeFileName, targetFileName); - } else { - const std::string kParquetSuffix = ".parquet"; - if (folly::StringPiece(targetFileName).endsWith(kParquetSuffix)) { - // Remove the .parquet suffix. - auto trimmedFilename = targetFileName.substr( - 0, targetFileName.size() - kParquetSuffix.size()); - ASSERT_TRUE(writeFileName.find(trimmedFilename) != std::string::npos); - } else { - ASSERT_TRUE(writeFileName.find(targetFileName) != std::string::npos); - } - } - } - if (!commitContextVector->isNullAt(i)) { - ASSERT_TRUE(RE2::FullMatch( - commitContextVector->valueAt(i).getString(), - fmt::format(".*{}.*", commitStrategyToString(commitStrategy_)))) - << commitContextVector->valueAt(i); - } - } - ASSERT_EQ(numRows, expectedRows); - if (testMode_ == TestMode::kUnpartitioned) { - ASSERT_GT(writeFiles.size(), 0); - ASSERT_LE(writeFiles.size(), numTableWriterCount_); - } - auto diskFiles = listAllFiles(outputDirectory->getPath()); - std::sort(diskFiles.begin(), diskFiles.end()); - std::sort(writeFiles.begin(), writeFiles.end()); - ASSERT_EQ(diskFiles, writeFiles) - << "\nwrite files: " << folly::join(",", writeFiles) - << "\ndisk files: " << folly::join(",", diskFiles); - // Verify the utilities provided by table writer traits. 
- ASSERT_EQ(TableWriteTraits::getRowCount(result), 10 * 100); - auto obj = TableWriteTraits::getTableCommitContext(result); - ASSERT_EQ( - obj[TableWriteTraits::kCommitStrategyContextKey], - commitStrategyToString(commitStrategy_)); - ASSERT_EQ(obj[TableWriteTraits::klastPageContextKey], true); - ASSERT_EQ(obj[TableWriteTraits::kLifeSpanContextKey], "TaskWide"); -} - -TEST_P(AllTableWriterTest, columnStatsDataTypes) { - auto rowType = - ROW({"c0", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8"}, - {BIGINT(), - INTEGER(), - SMALLINT(), - REAL(), - DOUBLE(), - VARCHAR(), - BOOLEAN(), - MAP(DATE(), BIGINT()), - ARRAY(BIGINT())}); - setDataTypes(rowType); - std::vector input; - input.push_back(makeRowVector( - rowType_->names(), - { - makeFlatVector(1'000, [&](auto row) { return 1; }), - makeFlatVector(1'000, [&](auto row) { return 1; }), - makeFlatVector(1'000, [&](auto row) { return row; }), - makeFlatVector(1'000, [&](auto row) { return row + 33.23; }), - makeFlatVector(1'000, [&](auto row) { return row + 33.23; }), - makeFlatVector( - 1'000, - [&](auto row) { - return StringView(std::to_string(row).c_str()); - }), - makeFlatVector(1'000, [&](auto row) { return true; }), - makeMapVector( - 1'000, - [](auto /*row*/) { return 5; }, - [](auto row) { return row; }, - [](auto row) { return row * 3; }), - makeArrayVector( - 1'000, - [](auto /*row*/) { return 5; }, - [](auto row) { return row * 3; }), - })); - createDuckDbTable(input); - auto outputDirectory = TempDirectoryPath::create(); - - std::vector groupingKeyFields; - for (int i = 0; i < partitionedBy_.size(); ++i) { - groupingKeyFields.emplace_back(std::make_shared( - partitionTypes_.at(i), partitionedBy_.at(i))); - } - - // aggregation node - core::TypedExprPtr intInputField = - std::make_shared(SMALLINT(), "c2"); - auto minCallExpr = std::make_shared( - SMALLINT(), std::vector{intInputField}, "min"); - auto maxCallExpr = std::make_shared( - SMALLINT(), std::vector{intInputField}, "max"); - auto distinctCountCallExpr = std::make_shared( - VARCHAR(), - std::vector{intInputField}, - "approx_distinct"); - - core::TypedExprPtr strInputField = - std::make_shared(VARCHAR(), "c5"); - auto maxDataSizeCallExpr = std::make_shared( - BIGINT(), - std::vector{strInputField}, - "max_data_size_for_stats"); - auto sumDataSizeCallExpr = std::make_shared( - BIGINT(), - std::vector{strInputField}, - "sum_data_size_for_stats"); - - core::TypedExprPtr boolInputField = - std::make_shared(BOOLEAN(), "c6"); - auto countCallExpr = std::make_shared( - BIGINT(), std::vector{boolInputField}, "count"); - auto countIfCallExpr = std::make_shared( - BIGINT(), std::vector{boolInputField}, "count_if"); - - core::TypedExprPtr mapInputField = - std::make_shared( - MAP(DATE(), BIGINT()), "c7"); - auto countMapCallExpr = std::make_shared( - BIGINT(), std::vector{mapInputField}, "count"); - auto sumDataSizeMapCallExpr = std::make_shared( - BIGINT(), - std::vector{mapInputField}, - "sum_data_size_for_stats"); - - core::TypedExprPtr arrayInputField = - std::make_shared( - MAP(DATE(), BIGINT()), "c7"); - auto countArrayCallExpr = std::make_shared( - BIGINT(), std::vector{mapInputField}, "count"); - auto sumDataSizeArrayCallExpr = std::make_shared( - BIGINT(), - std::vector{mapInputField}, - "sum_data_size_for_stats"); - - const std::vector aggregateNames = { - "min", - "max", - "approx_distinct", - "max_data_size_for_stats", - "sum_data_size_for_stats", - "count", - "count_if", - "count", - "sum_data_size_for_stats", - "count", - "sum_data_size_for_stats", - }; - - auto 
makeAggregate = [](const auto& callExpr) { - std::vector rawInputTypes; - for (const auto& input : callExpr->inputs()) { - rawInputTypes.push_back(input->type()); - } - return core::AggregationNode::Aggregate{ - callExpr, - rawInputTypes, - nullptr, // mask - {}, // sortingKeys - {} // sortingOrders - }; - }; - - std::vector aggregates = { - makeAggregate(minCallExpr), - makeAggregate(maxCallExpr), - makeAggregate(distinctCountCallExpr), - makeAggregate(maxDataSizeCallExpr), - makeAggregate(sumDataSizeCallExpr), - makeAggregate(countCallExpr), - makeAggregate(countIfCallExpr), - makeAggregate(countMapCallExpr), - makeAggregate(sumDataSizeMapCallExpr), - makeAggregate(countArrayCallExpr), - makeAggregate(sumDataSizeArrayCallExpr), - }; - const auto aggregationNode = std::make_shared( - core::PlanNodeId(), - core::AggregationNode::Step::kPartial, - groupingKeyFields, - std::vector{}, - aggregateNames, - aggregates, - false, // ignoreNullKeys - PlanBuilder().values({input}).planNode()); - + true); auto plan = PlanBuilder() - .values({input}) - .addNode(addTableWriter( - rowType_, - rowType_->names(), - aggregationNode, - std::make_shared( - kHiveConnectorId, - makeHiveInsertTableHandle( - rowType_->names(), - rowType_->children(), - partitionedBy_, - nullptr, - makeLocationHandle(outputDirectory->getPath()))), - false, - CommitStrategy::kNoCommit)) - .planNode(); - - // the result is in format of : row/fragments/context/[partition]/[stats] - int nextColumnStatsIndex = 3 + partitionedBy_.size(); - const RowVectorPtr result = AssertQueryBuilder(plan).copyResults(pool()); - auto minStatsVector = - result->childAt(nextColumnStatsIndex++)->asFlatVector(); - ASSERT_EQ(minStatsVector->valueAt(0), 0); - const auto maxStatsVector = - result->childAt(nextColumnStatsIndex++)->asFlatVector(); - ASSERT_EQ(maxStatsVector->valueAt(0), 999); - const auto distinctCountStatsVector = - result->childAt(nextColumnStatsIndex++)->asFlatVector(); - HashStringAllocator allocator{pool_.get()}; - DenseHll denseHll{ - std::string(distinctCountStatsVector->valueAt(0)).c_str(), &allocator}; - ASSERT_EQ(denseHll.cardinality(), 1000); - const auto maxDataSizeStatsVector = - result->childAt(nextColumnStatsIndex++)->asFlatVector(); - ASSERT_EQ(maxDataSizeStatsVector->valueAt(0), 7); - const auto sumDataSizeStatsVector = - result->childAt(nextColumnStatsIndex++)->asFlatVector(); - ASSERT_EQ(sumDataSizeStatsVector->valueAt(0), 6890); - const auto countStatsVector = - result->childAt(nextColumnStatsIndex++)->asFlatVector(); - ASSERT_EQ(countStatsVector->valueAt(0), 1000); - const auto countIfStatsVector = - result->childAt(nextColumnStatsIndex++)->asFlatVector(); - ASSERT_EQ(countIfStatsVector->valueAt(0), 1000); - const auto countMapStatsVector = - result->childAt(nextColumnStatsIndex++)->asFlatVector(); - ASSERT_EQ(countMapStatsVector->valueAt(0), 1000); - const auto sumDataSizeMapStatsVector = - result->childAt(nextColumnStatsIndex++)->asFlatVector(); - ASSERT_EQ(sumDataSizeMapStatsVector->valueAt(0), 64000); - const auto countArrayStatsVector = - result->childAt(nextColumnStatsIndex++)->asFlatVector(); - ASSERT_EQ(countArrayStatsVector->valueAt(0), 1000); - const auto sumDataSizeArrayStatsVector = - result->childAt(nextColumnStatsIndex++)->asFlatVector(); - ASSERT_EQ(sumDataSizeArrayStatsVector->valueAt(0), 64000); -} - -TEST_P(AllTableWriterTest, columnStats) { - auto input = makeVectors(1, 100); - createDuckDbTable(input); - auto outputDirectory = TempDirectoryPath::create(); - - // 1. 
standard columns - std::vector output = { - "numWrittenRows", "fragment", "tableCommitContext"}; - std::vector types = {BIGINT(), VARBINARY(), VARBINARY()}; - std::vector groupingKeys; - // 2. partition columns - for (int i = 0; i < partitionedBy_.size(); i++) { - groupingKeys.emplace_back( - std::make_shared( - partitionTypes_.at(i), partitionedBy_.at(i))); - output.emplace_back(partitionedBy_.at(i)); - types.emplace_back(partitionTypes_.at(i)); - } - // 3. stats columns - output.emplace_back("min"); - types.emplace_back(BIGINT()); - const auto writerOutputType = ROW(std::move(output), std::move(types)); - - // aggregation node - auto aggregationNode = generateAggregationNode( - "c0", - groupingKeys, - core::AggregationNode::Step::kPartial, - PlanBuilder().values({input}).planNode()); - - auto plan = PlanBuilder() - .values({input}) - .addNode(addTableWriter( - rowType_, - rowType_->names(), - aggregationNode, - std::make_shared( - kHiveConnectorId, - makeHiveInsertTableHandle( - rowType_->names(), - rowType_->children(), - partitionedBy_, - bucketProperty_, - makeLocationHandle(outputDirectory->getPath()))), - false, - commitStrategy_)) - .planNode(); - - auto result = AssertQueryBuilder(plan).copyResults(pool()); - auto rowVector = result->childAt(0)->asFlatVector(); - auto fragmentVector = result->childAt(1)->asFlatVector(); - auto columnStatsVector = - result->childAt(3 + partitionedBy_.size())->asFlatVector(); - - std::vector writeFiles; - - // For partitioned, expected result is as follows: - // Row Fragment Context partition c1_min_value - // null null x partition1 0 - // null null x partition2 10 - // null null x partition3 15 - // count null x null null - // null partition1_update x null null - // null partition1_update x null null - // null partition2_update x null null - // null partition2_update x null null - // null partition3_update x null null - // - // Note that we can have multiple same partition_update, they're for - // different files, but for stats, we would only have one record for each - // partition - // - // For unpartitioned, expected result is: - // Row Fragment Context partition c1_min_value - // null null x 0 - // count null x null null - // null update x null null - - int countRow = 0; - while (!columnStatsVector->isNullAt(countRow)) { - countRow++; - } - for (int i = 0; i < result->size(); ++i) { - if (i < countRow) { - ASSERT_FALSE(columnStatsVector->isNullAt(i)); - ASSERT_TRUE(rowVector->isNullAt(i)); - ASSERT_TRUE(fragmentVector->isNullAt(i)); - } else if (i == countRow) { - ASSERT_TRUE(columnStatsVector->isNullAt(i)); - ASSERT_FALSE(rowVector->isNullAt(i)); - ASSERT_TRUE(fragmentVector->isNullAt(i)); - } else { - ASSERT_TRUE(columnStatsVector->isNullAt(i)); - ASSERT_TRUE(rowVector->isNullAt(i)); - ASSERT_FALSE(fragmentVector->isNullAt(i)); - } - } -} - -TEST_P(AllTableWriterTest, columnStatsWithTableWriteMerge) { - auto input = makeVectors(1, 100); - createDuckDbTable(input); - auto outputDirectory = TempDirectoryPath::create(); - - // 1. standard columns - std::vector output = { - "numWrittenRows", "fragment", "tableCommitContext"}; - std::vector types = {BIGINT(), VARBINARY(), VARBINARY()}; - std::vector groupingKeys; - // 2. partition columns - for (int i = 0; i < partitionedBy_.size(); i++) { - groupingKeys.emplace_back( - std::make_shared( - partitionTypes_.at(i), partitionedBy_.at(i))); - output.emplace_back(partitionedBy_.at(i)); - types.emplace_back(partitionTypes_.at(i)); - } - // 3. 
stats columns - output.emplace_back("min"); - types.emplace_back(BIGINT()); - const auto writerOutputType = ROW(std::move(output), std::move(types)); - - // aggregation node - auto aggregationNode = generateAggregationNode( - "c0", - groupingKeys, - core::AggregationNode::Step::kPartial, - PlanBuilder().values({input}).planNode()); - - auto tableWriterPlan = PlanBuilder().values({input}).addNode(addTableWriter( - rowType_, - rowType_->names(), - aggregationNode, - std::make_shared( - kHiveConnectorId, - makeHiveInsertTableHandle( - rowType_->names(), - rowType_->children(), - partitionedBy_, - bucketProperty_, - makeLocationHandle(outputDirectory->getPath()))), - false, - commitStrategy_)); - - auto mergeAggregationNode = generateAggregationNode( - "min", - groupingKeys, - core::AggregationNode::Step::kIntermediate, - std::move(tableWriterPlan.planNode())); - - auto finalPlan = tableWriterPlan.capturePlanNodeId(tableWriteNodeId_) - .localPartition(std::vector{}) - .tableWriteMerge(std::move(mergeAggregationNode)) - .planNode(); - - auto result = AssertQueryBuilder(finalPlan).copyResults(pool()); - auto rowVector = result->childAt(0)->asFlatVector(); - auto fragmentVector = result->childAt(1)->asFlatVector(); - auto columnStatsVector = - result->childAt(3 + partitionedBy_.size())->asFlatVector(); - - std::vector writeFiles; - - // For partitioned, expected result is as follows: - // Row Fragment Context partition c1_min_value - // null null x partition1 0 - // null null x partition2 10 - // null null x partition3 15 - // count null x null null - // null partition1_update x null null - // null partition1_update x null null - // null partition2_update x null null - // null partition2_update x null null - // null partition3_update x null null - // - // Note that we can have multiple same partition_update, they're for - // different files, but for stats, we would only have one record for each - // partition - // - // For unpartitioned, expected result is: - // Row Fragment Context partition c1_min_value - // null null x 0 - // count null x null null - // null update x null null - - int statsRow = 0; - while (columnStatsVector->isNullAt(statsRow) && statsRow < result->size()) { - ++statsRow; - } - for (int i = 1; i < result->size(); ++i) { - if (i < statsRow) { - ASSERT_TRUE(rowVector->isNullAt(i)); - ASSERT_FALSE(fragmentVector->isNullAt(i)); - ASSERT_TRUE(columnStatsVector->isNullAt(i)); - } else if (i < result->size() - 1) { - ASSERT_TRUE(rowVector->isNullAt(i)); - ASSERT_TRUE(fragmentVector->isNullAt(i)); - ASSERT_FALSE(columnStatsVector->isNullAt(i)); - } else { - ASSERT_FALSE(rowVector->isNullAt(i)); - ASSERT_TRUE(fragmentVector->isNullAt(i)); - ASSERT_TRUE(columnStatsVector->isNullAt(i)); - } - } -} - -// TODO: add partitioned table write update mode tests and more failure tests. - -TEST_P(AllTableWriterTest, tableWriterStats) { - const int32_t numBatches = 2; - auto rowType = - ROW({"c0", "p0", "c3", "c5"}, {VARCHAR(), BIGINT(), REAL(), VARCHAR()}); - std::vector partitionKeys = {"p0"}; - - VectorFuzzer::Options options; - options.vectorSize = 1000; - VectorFuzzer fuzzer(options, pool()); - // Partition vector is constant vector. 
- std::vector vectors = makeBatches(numBatches, [&](auto) { - return makeRowVector( - rowType->names(), - {fuzzer.fuzzFlat(VARCHAR()), - fuzzer.fuzzConstant(BIGINT()), - fuzzer.fuzzFlat(REAL()), - fuzzer.fuzzFlat(VARCHAR())}); - }); - createDuckDbTable(vectors); - - auto inputFilePaths = makeFilePaths(numBatches); - for (int i = 0; i < numBatches; i++) { - writeToFile(inputFilePaths[i]->getPath(), vectors[i]); - } - - auto outputDirectory = TempDirectoryPath::create(); - const int numWriters = getNumWriters(); - auto plan = createInsertPlan( - PlanBuilder().tableScan(rowType), - rowType, - outputDirectory->getPath(), - partitionKeys, - bucketProperty_, - compressionKind_, - numWriters, - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - - auto task = assertQueryWithWriterConfigs( - plan, inputFilePaths, "SELECT count(*) FROM tmp"); - - // Each batch would create a new partition, numWrittenFiles is same as - // partition num when not bucketed. When bucketed, it's partitionNum * - // bucketNum, bucket number is 4 - const int numWrittenFiles = - bucketProperty_ == nullptr ? numBatches : numBatches * 4; - // The size of bytes (ORC_MAGIC_LEN) written when the DWRF writer - // initializes a file. - const int32_t ORC_HEADER_LEN{3}; - const auto fixedWrittenBytes = - numWrittenFiles * (fileFormat_ == FileFormat::DWRF ? ORC_HEADER_LEN : 0); - - auto planStats = toPlanStats(task->taskStats()); - auto& stats = planStats.at(tableWriteNodeId_); - ASSERT_GT(stats.physicalWrittenBytes, fixedWrittenBytes); - ASSERT_GT( - stats.operatorStats.at("TableWrite")->physicalWrittenBytes, - fixedWrittenBytes); - ASSERT_EQ( - stats.operatorStats.at("TableWrite") - ->customStats.at(TableWriter::kNumWrittenFiles) - .sum, - numWrittenFiles); - ASSERT_GE( - stats.operatorStats.at("TableWrite") - ->customStats.at(TableWriter::kWriteIOTime) - .sum, - 0); - ASSERT_GE( - stats.operatorStats.at("TableWrite") - ->customStats.at(TableWriter::kRunningWallNanos) - .sum, - 0); -} - -DEBUG_ONLY_TEST_P( - UnpartitionedTableWriterTest, - fileWriterFlushErrorOnDriverClose) { - VectorFuzzer::Options options; - const int batchSize = 1000; - options.vectorSize = batchSize; - VectorFuzzer fuzzer(options, pool()); - const int numBatches = 10; - std::vector vectors; - int numRows{0}; - for (int i = 0; i < numBatches; ++i) { - numRows += batchSize; - vectors.push_back(fuzzer.fuzzRow(rowType_)); - } - std::atomic writeInputs{0}; - std::atomic triggerWriterOOM{false}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::addInput", - std::function([&](Operator* op) { - if (op->operatorType() != "TableWrite") { - return; - } - if (++writeInputs != 3) { - return; - } - op->operatorCtx()->task()->requestAbort(); - triggerWriterOOM = true; - })); - SCOPED_TESTVALUE_SET( - "facebook::velox::memory::MemoryPoolImpl::reserveThreadSafe", - std::function([&](memory::MemoryPool* pool) { - const std::string dictPoolRe(".*dictionary"); - const std::string generalPoolRe(".*general"); - const std::string compressionPoolRe(".*compression"); - if (!RE2::FullMatch(pool->name(), dictPoolRe) && - !RE2::FullMatch(pool->name(), generalPoolRe) && - !RE2::FullMatch(pool->name(), compressionPoolRe)) { - return; - } - if (!triggerWriterOOM) { - return; - } - VELOX_MEM_POOL_CAP_EXCEEDED("Inject write OOM"); - })); - - auto outputDirectory = TempDirectoryPath::create(); - auto op = createInsertPlan( - PlanBuilder().values(vectors), - rowType_, - outputDirectory->getPath(), - partitionedBy_, - bucketProperty_, - compressionKind_, - 
getNumWriters(), - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - - VELOX_ASSERT_THROW( - assertQuery(op, fmt::format("SELECT {}", numRows)), - "Aborted for external error"); -} - -DEBUG_ONLY_TEST_P(UnpartitionedTableWriterTest, dataSinkAbortError) { - if (fileFormat_ != FileFormat::DWRF) { - // NOTE: only test on dwrf writer format as we inject write error in dwrf - // writer. - return; - } - VectorFuzzer::Options options; - const int batchSize = 100; - options.vectorSize = batchSize; - VectorFuzzer fuzzer(options, pool()); - auto vector = fuzzer.fuzzInputRow(rowType_); - - std::atomic triggerWriterErrorOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::dwrf::Writer::write", - std::function([&](dwrf::Writer* /*unused*/) { - if (!triggerWriterErrorOnce.exchange(false)) { - return; - } - VELOX_FAIL("inject writer error"); - })); - - std::atomic triggerAbortErrorOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::connector::hive::HiveDataSink::closeInternal", - std::function( - [&](const HiveDataSink* /*unused*/) { - if (!triggerAbortErrorOnce.exchange(false)) { - return; - } - VELOX_FAIL("inject abort error"); - })); - - auto outputDirectory = TempDirectoryPath::create(); - auto plan = PlanBuilder() - .values({vector}) - .tableWrite(outputDirectory->getPath(), fileFormat_) + .tableScan(rowType) + .tableWriter(insertHandle, {"a", "b"}, {}) .planNode(); - VELOX_ASSERT_THROW( - AssertQueryBuilder(plan).copyResults(pool()), "inject writer error"); - ASSERT_FALSE(triggerWriterErrorOnce); - ASSERT_FALSE(triggerAbortErrorOnce); -} - -TEST_P(BucketSortOnlyTableWriterTest, sortWriterSpill) { - SCOPED_TRACE(testParam_.toString()); - - const auto vectors = makeVectors(5, 500); - createDuckDbTable(vectors); - - auto outputDirectory = TempDirectoryPath::create(); - auto op = createInsertPlan( - PlanBuilder().values(vectors), - rowType_, - outputDirectory->getPath(), - partitionedBy_, - bucketProperty_, - compressionKind_, - getNumWriters(), - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - - const auto spillStats = globalSpillStats(); - auto task = - assertQueryWithWriterConfigs(op, fmt::format("SELECT {}", 5 * 500), true); - if (partitionedBy_.size() > 0) { - rowType_ = getNonPartitionsColumns(partitionedBy_, rowType_); - verifyTableWriterOutput(outputDirectory->getPath(), rowType_); - } else { - verifyTableWriterOutput(outputDirectory->getPath(), rowType_); - } - - const auto updatedSpillStats = globalSpillStats(); - ASSERT_GT(updatedSpillStats.spilledBytes, spillStats.spilledBytes); - ASSERT_GT(updatedSpillStats.spilledPartitions, spillStats.spilledPartitions); - auto taskStats = toPlanStats(task->taskStats()); - auto& stats = taskStats.at(tableWriteNodeId_); - ASSERT_GT(stats.spilledRows, 0); - ASSERT_GT(stats.spilledBytes, 0); - // One spilled partition per each written files. 
-  const int numWrittenFiles = stats.customStats["numWrittenFiles"].sum;
-  ASSERT_GE(stats.spilledPartitions, numWrittenFiles);
-  ASSERT_GT(stats.customStats[Operator::kSpillRuns].sum, 0);
-  ASSERT_GT(stats.customStats[Operator::kSpillFillTime].sum, 0);
-  ASSERT_GT(stats.customStats[Operator::kSpillSortTime].sum, 0);
-  ASSERT_GT(stats.customStats[Operator::kSpillExtractVectorTime].sum, 0);
-  ASSERT_GT(stats.customStats[Operator::kSpillSerializationTime].sum, 0);
-  ASSERT_GT(stats.customStats[Operator::kSpillFlushTime].sum, 0);
-  ASSERT_GT(stats.customStats[Operator::kSpillWrites].sum, 0);
-  ASSERT_GT(stats.customStats[Operator::kSpillWriteTime].sum, 0);
+  assertQuery(plan, "SELECT * FROM tmp");
 }
 
-DEBUG_ONLY_TEST_P(BucketSortOnlyTableWriterTest, outputBatchRows) {
-  struct {
-    uint32_t maxOutputRows;
-    std::string maxOutputBytes;
-    int expectedOutputCount;
-
-    // TODO: add output size check with spilling enabled
-    std::string debugString() const {
-      return fmt::format(
-          "maxOutputRows: {}, maxOutputBytes: {}, expectedOutputCount: {}",
-          maxOutputRows,
-          maxOutputBytes,
-          expectedOutputCount);
-    }
-  } testSettings[] = {// we have 4 buckets thus 4 writers.
-                      {10000, "1000kB", 4},
-                      // when maxOutputRows = 1, 1000 rows triggers 1000 writes
-                      {1, "1kB", 1000},
-                      // estimatedRowSize is ~62bytes, when maxOutputSize = 62 *
-                      // 100, 1000 rows triggers ~10 writes
-                      {10000, "6200B", 12}};
-
-  for (const auto& testData : testSettings) {
-    SCOPED_TRACE(testData.debugString());
-    std::atomic_int outputCount{0};
-    SCOPED_TESTVALUE_SET(
-        "facebook::velox::dwrf::Writer::write",
-        std::function<void(dwrf::Writer*)>(
-            [&](dwrf::Writer* /*unused*/) { ++outputCount; }));
-
-    auto rowType =
-        ROW({"c0", "p0", "c1", "c3", "c4", "c5"},
-            {VARCHAR(), BIGINT(), INTEGER(), REAL(), DOUBLE(), VARCHAR()});
-    std::vector<std::string> partitionKeys = {"p0"};
-
-    // Partition vector is constant vector.
- std::vector vectors = makeBatches(1, [&](auto) { - return makeRowVector( - rowType->names(), - {makeFlatVector( - 1'000, - [&](auto row) { - return StringView::makeInline(fmt::format("str_{}", row)); - }), - makeConstant((int64_t)365, 1'000), - makeConstant((int32_t)365, 1'000), - makeFlatVector(1'000, [&](auto row) { return row + 33.23; }), - makeFlatVector(1'000, [&](auto row) { return row + 33.23; }), - makeFlatVector(1'000, [&](auto row) { - return StringView::makeInline(fmt::format("bucket_{}", row * 3)); - })}); - }); - createDuckDbTable(vectors); - - auto outputDirectory = TempDirectoryPath::create(); - auto plan = createInsertPlan( - PlanBuilder().values({vectors}), - rowType, - outputDirectory->getPath(), - partitionKeys, - bucketProperty_, - compressionKind_, - 1, - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - const std::shared_ptr task = - AssertQueryBuilder(plan, duckDbQueryRunner_) - .config(QueryConfig::kTaskWriterCount, std::to_string(1)) - .connectorSessionProperty( - kHiveConnectorId, - HiveConfig::kSortWriterMaxOutputRowsSession, - folly::to(testData.maxOutputRows)) - .connectorSessionProperty( - kHiveConnectorId, - HiveConfig::kSortWriterMaxOutputBytesSession, - folly::to(testData.maxOutputBytes)) - .assertResults("SELECT count(*) FROM tmp"); - auto stats = task->taskStats().pipelineStats.front().operatorStats; - ASSERT_EQ(outputCount, testData.expectedOutputCount); - } -} - -DEBUG_ONLY_TEST_P(BucketSortOnlyTableWriterTest, yield) { - auto rowType = - ROW({"c0", "p0", "c1", "c3", "c4", "c5"}, - {VARCHAR(), BIGINT(), INTEGER(), REAL(), DOUBLE(), VARCHAR()}); - std::vector partitionKeys = {"p0"}; - - // Partition vector is constant vector. - std::vector vectors = makeBatches(1, [&](auto) { - return makeRowVector( - rowType->names(), - {makeFlatVector( - 1'000, - [&](auto row) { - return StringView::makeInline(fmt::format("str_{}", row)); - }), - makeConstant((int64_t)365, 1'000), - makeConstant((int32_t)365, 1'000), - makeFlatVector(1'000, [&](auto row) { return row + 33.23; }), - makeFlatVector(1'000, [&](auto row) { return row + 33.23; }), - makeFlatVector(1'000, [&](auto row) { - return StringView::makeInline(fmt::format("bucket_{}", row * 3)); - })}); - }); - createDuckDbTable(vectors); - - struct { - uint64_t flushTimeSliceLimitMs; - bool expectedYield; - - std::string debugString() const { - return fmt::format( - "flushTimeSliceLimitMs: {}, expectedYield: {}", - flushTimeSliceLimitMs, - expectedYield); - } - } testSettings[] = {{0, false}, {1, true}, {10'000, false}}; - for (const auto& testData : testSettings) { - SCOPED_TRACE(testData.debugString()); - std::atomic_bool injectDelayOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::dwrf::Writer::write", - std::function([&](dwrf::Writer* /*unused*/) { - if (!injectDelayOnce.exchange(false)) { - return; - } - std::this_thread::sleep_for(std::chrono::seconds(2)); - })); - createDuckDbTable(vectors); - - auto outputDirectory = TempDirectoryPath::create(); - auto plan = createInsertPlan( - PlanBuilder().values({vectors}), - rowType, - outputDirectory->getPath(), - partitionKeys, - bucketProperty_, - compressionKind_, - 1, - connector::hive::LocationHandle::TableType::kNew, - commitStrategy_); - const int prevYieldCount = Driver::yieldCount(); - const std::shared_ptr task = - AssertQueryBuilder(plan, duckDbQueryRunner_) - .config(QueryConfig::kTaskWriterCount, std::to_string(1)) - .connectorSessionProperty( - kHiveConnectorId, - HiveConfig::kSortWriterFinishTimeSliceLimitMsSession, - 
folly::to<std::string>(testData.flushTimeSliceLimitMs))
-            .connectorSessionProperty(
-                kHiveConnectorId,
-                HiveConfig::kSortWriterMaxOutputRowsSession,
-                folly::to<std::string>(100))
-            .connectorSessionProperty(
-                kHiveConnectorId,
-                HiveConfig::kSortWriterMaxOutputBytesSession,
-                folly::to<std::string>("1KB"))
-            .assertResults("SELECT count(*) FROM tmp");
-    auto stats = task->taskStats().pipelineStats.front().operatorStats;
-    if (testData.expectedYield) {
-      ASSERT_GT(Driver::yieldCount(), prevYieldCount);
-    } else {
-      ASSERT_EQ(Driver::yieldCount(), prevYieldCount);
-    }
-  }
-}
-
-VELOX_INSTANTIATE_TEST_SUITE_P(
-    TableWriterTest,
-    UnpartitionedTableWriterTest,
-    testing::ValuesIn(UnpartitionedTableWriterTest::getTestParams()));
-
-VELOX_INSTANTIATE_TEST_SUITE_P(
-    TableWriterTest,
-    PartitionedTableWriterTest,
-    testing::ValuesIn(PartitionedTableWriterTest::getTestParams()));
-
-VELOX_INSTANTIATE_TEST_SUITE_P(
-    TableWriterTest,
-    BucketedTableOnlyWriteTest,
-    testing::ValuesIn(BucketedTableOnlyWriteTest::getTestParams()));
-
-VELOX_INSTANTIATE_TEST_SUITE_P(
-    TableWriterTest,
-    AllTableWriterTest,
-    testing::ValuesIn(AllTableWriterTest::getTestParams()));
-
-VELOX_INSTANTIATE_TEST_SUITE_P(
-    TableWriterTest,
-    PartitionedWithoutBucketTableWriterTest,
-    testing::ValuesIn(
-        PartitionedWithoutBucketTableWriterTest::getTestParams()));
-
-VELOX_INSTANTIATE_TEST_SUITE_P(
-    TableWriterTest,
-    BucketSortOnlyTableWriterTest,
-    testing::ValuesIn(BucketSortOnlyTableWriterTest::getTestParams()));
-
-class TableWriterArbitrationTest : public HiveConnectorTestBase {
- protected:
-  void SetUp() override {
-    HiveConnectorTestBase::SetUp();
-    filesystems::registerLocalFileSystem();
-    if (!isRegisteredVectorSerde()) {
-      this->registerVectorSerde();
-    }
-
-    rowType_ = ROW(
-        {{"c0", INTEGER()},
-         {"c1", INTEGER()},
-         {"c2", VARCHAR()},
-         {"c3", VARCHAR()}});
-    fuzzerOpts_.vectorSize = 1024;
-    fuzzerOpts_.nullRatio = 0;
-    fuzzerOpts_.stringVariableLength = false;
-    fuzzerOpts_.stringLength = 1024;
-    fuzzerOpts_.allowLazyVector = false;
-  }
-
-  folly::Random::DefaultGenerator rng_;
-  RowTypePtr rowType_;
-  VectorFuzzer::Options fuzzerOpts_;
-};
-
-DEBUG_ONLY_TEST_F(TableWriterArbitrationTest, reclaimFromTableWriter) {
-  VectorFuzzer::Options options;
-  const int batchSize = 1'000;
-  options.vectorSize = batchSize;
-  options.stringVariableLength = false;
-  options.stringLength = 1'000;
-  VectorFuzzer fuzzer(options, pool());
-  const int numBatches = 20;
-  std::vector<RowVectorPtr> vectors;
-  int numRows{0};
-  for (int i = 0; i < numBatches; ++i) {
-    numRows += batchSize;
-    vectors.push_back(fuzzer.fuzzRow(rowType_));
-  }
-  createDuckDbTable(vectors);
-
-  for (bool writerSpillEnabled : {false, true}) {
-    {
-      SCOPED_TRACE(fmt::format("writerSpillEnabled: {}", writerSpillEnabled));
-      auto queryPool = memory::memoryManager()->addRootPool(
-          "reclaimFromTableWriter", kQueryMemoryCapacity);
-      auto* arbitrator = memory::memoryManager()->arbitrator();
-      const int numPrevArbitrationFailures = arbitrator->stats().numFailures;
-      const int numPrevNonReclaimableAttempts =
-          arbitrator->stats().numNonReclaimableAttempts;
-      auto queryCtx = core::QueryCtx::create(
-          executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool));
-      ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity);
-
-      std::atomic_int numInputs{0};
-      SCOPED_TESTVALUE_SET(
-          "facebook::velox::exec::Driver::runInternal::addInput",
-          std::function<void(Operator*)>(([&](Operator* op) {
-            if (op->operatorType() != "TableWrite") {
-              return;
-            }
-            // We reclaim memory from table writer connector memory pool which
-            // connects to the memory pools inside the hive connector.
-            ASSERT_FALSE(op->canReclaim());
-            if (++numInputs != numBatches) {
-              return;
-            }
-
-            const auto fakeAllocationSize =
-                kQueryMemoryCapacity - op->pool()->parent()->reservedBytes();
-            if (writerSpillEnabled) {
-              auto* buffer = op->pool()->allocate(fakeAllocationSize);
-              op->pool()->free(buffer, fakeAllocationSize);
-            } else {
-              VELOX_ASSERT_THROW(
-                  op->pool()->allocate(fakeAllocationSize),
-                  "Exceeded memory pool");
-            }
-          })));
-
-      auto spillDirectory = TempDirectoryPath::create();
-      auto outputDirectory = TempDirectoryPath::create();
-      core::PlanNodeId tableWriteNodeId;
-      auto writerPlan =
-          PlanBuilder()
-              .values(vectors)
-              .tableWrite(outputDirectory->getPath())
-              .capturePlanNodeId(tableWriteNodeId)
-              .project({TableWriteTraits::rowCountColumnName()})
-              .singleAggregation(
-                  {},
-                  {fmt::format(
-                      "sum({})", TableWriteTraits::rowCountColumnName())})
-              .planNode();
-      {
-        auto task =
-            AssertQueryBuilder(duckDbQueryRunner_)
-                .queryCtx(queryCtx)
-                .maxDrivers(1)
-                .spillDirectory(spillDirectory->getPath())
-                .config(core::QueryConfig::kSpillEnabled, writerSpillEnabled)
-                .config(
-                    core::QueryConfig::kWriterSpillEnabled, writerSpillEnabled)
-                // Set 0 file writer flush threshold to always trigger flush
-                // in test.
-                .config(core::QueryConfig::kWriterFlushThresholdBytes, 0)
-                .plan(std::move(writerPlan))
-                .assertResults(fmt::format("SELECT {}", numRows));
-        auto planStats = toPlanStats(task->taskStats());
-        auto& tableWriteStats =
-            planStats.at(tableWriteNodeId).operatorStats.at("TableWrite");
-        if (writerSpillEnabled) {
-          ASSERT_GT(
-              tableWriteStats->customStats
-                  .at(HiveDataSink::kEarlyFlushedRawBytes)
-                  .count,
-              0);
-          ASSERT_GT(
-              tableWriteStats->customStats
-                  .at(HiveDataSink::kEarlyFlushedRawBytes)
-                  .sum,
-              0);
-          ASSERT_EQ(
-              arbitrator->stats().numFailures, numPrevArbitrationFailures);
-        } else {
-          ASSERT_EQ(
-              tableWriteStats->customStats.count(
-                  HiveDataSink::kEarlyFlushedRawBytes),
-              0);
-          ASSERT_EQ(
-              arbitrator->stats().numFailures, numPrevArbitrationFailures + 1);
-        }
-        ASSERT_EQ(
-            arbitrator->stats().numNonReclaimableAttempts,
-            numPrevNonReclaimableAttempts);
-      }
-      waitForAllTasksToBeDeleted(3'000'000);
-    }
-  }
-}
-
-DEBUG_ONLY_TEST_F(TableWriterArbitrationTest, reclaimFromSortTableWriter) {
-  VectorFuzzer::Options options;
-  const int batchSize = 1'000;
-  options.vectorSize = batchSize;
-  options.stringVariableLength = false;
-  options.stringLength = 1'000;
-  VectorFuzzer fuzzer(options, pool());
-  const int numBatches = 20;
-  std::vector<RowVectorPtr> vectors;
-  int numRows{0};
-  const auto partitionKeyVector = makeFlatVector<int32_t>(
-      batchSize, [&](vector_size_t /*unused*/) { return 0; });
-  for (int i = 0; i < numBatches; ++i) {
-    numRows += batchSize;
-    vectors.push_back(fuzzer.fuzzInputRow(rowType_));
-    vectors.back()->childAt(0) = partitionKeyVector;
-  }
-  createDuckDbTable(vectors);
-
-  for (bool writerSpillEnabled : {false, true}) {
-    {
-      SCOPED_TRACE(fmt::format("writerSpillEnabled: {}", writerSpillEnabled));
-      auto queryPool = memory::memoryManager()->addRootPool(
-          "reclaimFromSortTableWriter", kQueryMemoryCapacity);
-      auto* arbitrator = memory::memoryManager()->arbitrator();
-      const int numPrevArbitrationFailures = arbitrator->stats().numFailures;
-      const int numPrevNonReclaimableAttempts =
-          arbitrator->stats().numNonReclaimableAttempts;
-      auto queryCtx = core::QueryCtx::create(
-          executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool));
-      ASSERT_EQ(queryCtx->pool()->capacity(),
kQueryMemoryCapacity); - - const auto spillStats = common::globalSpillStats(); - std::atomic numInputs{0}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::addInput", - std::function(([&](Operator* op) { - if (op->operatorType() != "TableWrite") { - return; - } - // We reclaim memory from table writer connector memory pool which - // connects to the memory pools inside the hive connector. - ASSERT_FALSE(op->canReclaim()); - if (++numInputs != numBatches) { - return; - } - - const auto fakeAllocationSize = - kQueryMemoryCapacity - op->pool()->parent()->reservedBytes(); - if (writerSpillEnabled) { - auto* buffer = op->pool()->allocate(fakeAllocationSize); - op->pool()->free(buffer, fakeAllocationSize); - } else { - VELOX_ASSERT_THROW( - op->pool()->allocate(fakeAllocationSize), - "Exceeded memory pool"); - } - }))); - - auto spillDirectory = TempDirectoryPath::create(); - auto outputDirectory = TempDirectoryPath::create(); - auto writerPlan = - PlanBuilder() - .values(vectors) - .tableWrite( - outputDirectory->getPath(), - {"c0"}, - 4, - {"c1"}, - { - std::make_shared( - "c2", core::SortOrder{false, false}), - }) - .project({TableWriteTraits::rowCountColumnName()}) - .singleAggregation( - {}, - {fmt::format( - "sum({})", TableWriteTraits::rowCountColumnName())}) - .planNode(); - - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(queryCtx) - .maxDrivers(1) - .spillDirectory(spillDirectory->getPath()) - .config(core::QueryConfig::kSpillEnabled, writerSpillEnabled) - .config(core::QueryConfig::kWriterSpillEnabled, writerSpillEnabled) - // Set 0 file writer flush threshold to always trigger flush in - // test. - .config(core::QueryConfig::kWriterFlushThresholdBytes, 0) - .plan(std::move(writerPlan)) - .assertResults(fmt::format("SELECT {}", numRows)); - - ASSERT_EQ( - arbitrator->stats().numFailures, - numPrevArbitrationFailures + (writerSpillEnabled ? 
0 : 1)); - ASSERT_EQ( - arbitrator->stats().numNonReclaimableAttempts, - numPrevNonReclaimableAttempts); - - waitForAllTasksToBeDeleted(3'000'000); - const auto updatedSpillStats = common::globalSpillStats(); - if (writerSpillEnabled) { - ASSERT_GT(updatedSpillStats.spilledBytes, spillStats.spilledBytes); - ASSERT_GT( - updatedSpillStats.spilledPartitions, spillStats.spilledPartitions); - } else { - ASSERT_EQ(updatedSpillStats, spillStats); - } - } - } -} - -DEBUG_ONLY_TEST_F(TableWriterArbitrationTest, writerFlushThreshold) { - VectorFuzzer::Options options; - const int batchSize = 1'000; - options.vectorSize = batchSize; - options.stringVariableLength = false; - options.stringLength = 1'000; - const int numBatches = 20; - const int numRows = numBatches * batchSize; - std::vector vectors = - createVectors(numBatches, rowType_, options); - createDuckDbTable(vectors); - - struct TestParam { - uint64_t bytesToReserve{0}; - uint64_t writerFlushThreshold{0}; - }; - const std::vector testParams{ - {0, 0}, {0, 1UL << 30}, {64UL << 20, 1UL << 30}}; - for (const auto& testParam : testParams) { - SCOPED_TRACE(fmt::format( - "bytesToReserve: {}, writerFlushThreshold: {}", - succinctBytes(testParam.bytesToReserve), - succinctBytes(testParam.writerFlushThreshold))); - - auto queryPool = memory::memoryManager()->addRootPool( - "writerFlushThreshold", kQueryMemoryCapacity); - auto* arbitrator = memory::memoryManager()->arbitrator(); - const int numPrevArbitrationFailures = arbitrator->stats().numFailures; - const int numPrevNonReclaimableAttempts = - arbitrator->stats().numNonReclaimableAttempts; - auto queryCtx = core::QueryCtx::create( - executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); - ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); - - memory::MemoryPool* compressionPool{nullptr}; - SCOPED_TESTVALUE_SET( - "facebook::velox::dwrf::Writer::write", - std::function([&](dwrf::Writer* writer) { - if (testParam.bytesToReserve == 0 || compressionPool != nullptr) { - return; - } - compressionPool = &(writer->getContext().getMemoryPool( - dwrf::MemoryUsageCategory::OUTPUT_STREAM)); - })); - - std::atomic numInputs{0}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::addInput", - std::function(([&](Operator* op) { - if (op->operatorType() != "TableWrite") { - return; - } - if (++numInputs != numBatches) { - return; - } - - if (testParam.bytesToReserve > 0) { - ASSERT_TRUE(compressionPool != nullptr); - compressionPool->maybeReserve(testParam.bytesToReserve); - } - - const auto fakeAllocationSize = - kQueryMemoryCapacity - op->pool()->parent()->usedBytes(); - if (testParam.writerFlushThreshold == 0) { - auto* buffer = op->pool()->allocate(fakeAllocationSize); - op->pool()->free(buffer, fakeAllocationSize); - } else { - VELOX_ASSERT_THROW( - op->pool()->allocate(fakeAllocationSize), - "Exceeded memory pool"); - } - }))); - - auto spillDirectory = TempDirectoryPath::create(); - auto outputDirectory = TempDirectoryPath::create(); - auto writerPlan = - PlanBuilder() - .values(vectors) - .tableWrite(outputDirectory->getPath()) - .project({TableWriteTraits::rowCountColumnName()}) - .singleAggregation( - {}, - {fmt::format( - "sum({})", TableWriteTraits::rowCountColumnName())}) - .planNode(); - - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(queryCtx) - .maxDrivers(1) - .spillDirectory(spillDirectory->getPath()) - .config(core::QueryConfig::kSpillEnabled, true) - .config(core::QueryConfig::kWriterSpillEnabled, true) - .config( - 
core::QueryConfig::kWriterFlushThresholdBytes, - testParam.writerFlushThreshold) - .plan(std::move(writerPlan)) - .assertResults(fmt::format("SELECT {}", numRows)); - - ASSERT_EQ( - arbitrator->stats().numFailures, - numPrevArbitrationFailures + - (testParam.writerFlushThreshold == 0 ? 0 : 1)); - // We don't trigger reclaim on a writer if it doesn't meet the writer - // flush threshold. - ASSERT_EQ( - arbitrator->stats().numNonReclaimableAttempts, - numPrevNonReclaimableAttempts); - ASSERT_GE(arbitrator->stats().reclaimedUsedBytes, testParam.bytesToReserve); - waitForAllTasksToBeDeleted(3'000'000); - queryCtx.reset(); - } -} - -DEBUG_ONLY_TEST_F( - TableWriterArbitrationTest, - reclaimFromNonReclaimableTableWriter) { - VectorFuzzer::Options options; - const int batchSize = 1'000; - options.vectorSize = batchSize; - options.stringVariableLength = false; - options.stringLength = 1'000; - VectorFuzzer fuzzer(options, pool()); - const int numBatches = 20; - std::vector vectors; - int numRows{0}; - for (int i = 0; i < numBatches; ++i) { - numRows += batchSize; - vectors.push_back(fuzzer.fuzzRow(rowType_)); - } - - createDuckDbTable(vectors); - - auto queryPool = memory::memoryManager()->addRootPool( - "reclaimFromNonReclaimableTableWriter", kQueryMemoryCapacity); - auto* arbitrator = memory::memoryManager()->arbitrator(); - const int numPrevArbitrationFailures = arbitrator->stats().numFailures; - const int numPrevNonReclaimableAttempts = - arbitrator->stats().numNonReclaimableAttempts; - auto queryCtx = core::QueryCtx::create( - executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); - ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); - - std::atomic injectFakeAllocationOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::dwrf::Writer::write", - std::function(([&](dwrf::Writer* writer) { - if (!injectFakeAllocationOnce.exchange(false)) { - return; - } - auto& pool = writer->getContext().getMemoryPool( - dwrf::MemoryUsageCategory::GENERAL); - const auto fakeAllocationSize = - kQueryMemoryCapacity - pool.reservedBytes(); - VELOX_ASSERT_THROW( - pool.allocate(fakeAllocationSize), "Exceeded memory pool"); - }))); - - auto outputDirectory = TempDirectoryPath::create(); - auto writerPlan = - PlanBuilder() - .values(vectors) - .tableWrite(outputDirectory->getPath()) - .project({TableWriteTraits::rowCountColumnName()}) - .singleAggregation( - {}, - {fmt::format("sum({})", TableWriteTraits::rowCountColumnName())}) - .planNode(); - - const auto spillDirectory = TempDirectoryPath::create(); - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(queryCtx) - .maxDrivers(1) - .spillDirectory(spillDirectory->getPath()) - .config(core::QueryConfig::kSpillEnabled, true) - .config(core::QueryConfig::kWriterSpillEnabled, true) - // Set file writer flush threshold of zero to always trigger flush in - // test. - .config(core::QueryConfig::kWriterFlushThresholdBytes, 0) - // Set large stripe and dictionary size thresholds to avoid writer - // internal stripe flush. 
- .connectorSessionProperty( - kHiveConnectorId, dwrf::Config::kOrcWriterMaxStripeSizeSession, "1GB") - .connectorSessionProperty( - kHiveConnectorId, - dwrf::Config::kOrcWriterMaxDictionaryMemorySession, - "1GB") - .plan(std::move(writerPlan)) - .assertResults(fmt::format("SELECT {}", numRows)); - - ASSERT_EQ(arbitrator->stats().numFailures, numPrevArbitrationFailures + 1); - ASSERT_EQ( - arbitrator->stats().numNonReclaimableAttempts, - numPrevNonReclaimableAttempts + 1); - waitForAllTasksToBeDeleted(); -} - -DEBUG_ONLY_TEST_F( - TableWriterArbitrationTest, - arbitrationFromTableWriterWithNoMoreInput) { - VectorFuzzer::Options options; - const int batchSize = 1'000; - options.vectorSize = batchSize; - options.stringVariableLength = false; - options.stringLength = 1'000; - VectorFuzzer fuzzer(options, pool()); - const int numBatches = 10; - std::vector vectors; - int numRows{0}; - for (int i = 0; i < numBatches; ++i) { - numRows += batchSize; - vectors.push_back(fuzzer.fuzzRow(rowType_)); - } - - createDuckDbTable(vectors); - auto queryPool = memory::memoryManager()->addRootPool( - "arbitrationFromTableWriterWithNoMoreInput", kQueryMemoryCapacity); - auto* arbitrator = memory::memoryManager()->arbitrator(); - const int numPrevArbitrationFailures = arbitrator->stats().numFailures; - const int numPrevNonReclaimableAttempts = - arbitrator->stats().numNonReclaimableAttempts; - const int numPrevReclaimedBytes = arbitrator->stats().reclaimedUsedBytes; - auto queryCtx = core::QueryCtx::create( - executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); - ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); - - std::atomic writerNoMoreInput{false}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::noMoreInput", - std::function(([&](Operator* op) { - if (op->operatorType() != "TableWrite") { - return; - } - writerNoMoreInput = true; - }))); - - std::atomic injectGetOutputOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::getOutput", - std::function(([&](Operator* op) { - if (op->operatorType() != "TableWrite") { - return; - } - if (!writerNoMoreInput) { - return; - } - if (!injectGetOutputOnce.exchange(false)) { - return; - } - const auto fakeAllocationSize = - kQueryMemoryCapacity - op->pool()->parent()->reservedBytes(); - auto* buffer = op->pool()->allocate(fakeAllocationSize); - op->pool()->free(buffer, fakeAllocationSize); - }))); - - auto outputDirectory = TempDirectoryPath::create(); - auto writerPlan = - PlanBuilder() - .values(vectors) - .tableWrite(outputDirectory->getPath()) - .project({TableWriteTraits::rowCountColumnName()}) - .singleAggregation( - {}, - {fmt::format("sum({})", TableWriteTraits::rowCountColumnName())}) - .planNode(); - - const auto spillDirectory = TempDirectoryPath::create(); - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(queryCtx) - .maxDrivers(1) - .spillDirectory(spillDirectory->getPath()) - .config(core::QueryConfig::kSpillEnabled, true) - .config(core::QueryConfig::kWriterSpillEnabled, true) - // Set 0 file writer flush threshold to always trigger flush in test. - .config(core::QueryConfig::kWriterFlushThresholdBytes, 0) - // Set large stripe and dictionary size thresholds to avoid writer - // internal stripe flush. 
- .connectorSessionProperty( - kHiveConnectorId, dwrf::Config::kOrcWriterMaxStripeSizeSession, "1GB") - .connectorSessionProperty( - kHiveConnectorId, - dwrf::Config::kOrcWriterMaxDictionaryMemorySession, - "1GB") - .plan(std::move(writerPlan)) - .assertResults(fmt::format("SELECT {}", numRows)); - - ASSERT_EQ( - arbitrator->stats().numNonReclaimableAttempts, - numPrevArbitrationFailures); - ASSERT_EQ(arbitrator->stats().numFailures, numPrevNonReclaimableAttempts); - ASSERT_GT(arbitrator->stats().reclaimedUsedBytes, numPrevReclaimedBytes); - waitForAllTasksToBeDeleted(); -} - -DEBUG_ONLY_TEST_F( - TableWriterArbitrationTest, - reclaimFromNonReclaimableSortTableWriter) { - VectorFuzzer::Options options; - const int batchSize = 1'000; - options.vectorSize = batchSize; - options.stringVariableLength = false; - options.stringLength = 1'000; - VectorFuzzer fuzzer(options, pool()); - const int numBatches = 20; - std::vector vectors; - int numRows{0}; - const auto partitionKeyVector = makeFlatVector( - batchSize, [&](vector_size_t /*unused*/) { return 0; }); - for (int i = 0; i < numBatches; ++i) { - numRows += batchSize; - vectors.push_back(fuzzer.fuzzInputRow(rowType_)); - vectors.back()->childAt(0) = partitionKeyVector; - } - - createDuckDbTable(vectors); - - auto queryPool = memory::memoryManager()->addRootPool( - "reclaimFromNonReclaimableSortTableWriter", kQueryMemoryCapacity); - auto* arbitrator = memory::memoryManager()->arbitrator(); - const int numPrevArbitrationFailures = arbitrator->stats().numFailures; - const int numPrevNonReclaimableAttempts = - arbitrator->stats().numNonReclaimableAttempts; - auto queryCtx = core::QueryCtx::create( - executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); - ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); - - std::atomic injectFakeAllocationOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::memory::MemoryPoolImpl::reserveThreadSafe", - std::function(([&](memory::MemoryPool* pool) { - const std::string re(".*sort"); - if (!RE2::FullMatch(pool->name(), re)) { - return; - } - const int writerMemoryUsage = 4L << 20; - if (pool->parent()->reservedBytes() < writerMemoryUsage) { - return; - } - if (!injectFakeAllocationOnce.exchange(false)) { - return; - } - const auto fakeAllocationSize = - kQueryMemoryCapacity - pool->parent()->reservedBytes(); - VELOX_ASSERT_THROW( - pool->allocate(fakeAllocationSize), "Exceeded memory pool"); - }))); - - auto outputDirectory = TempDirectoryPath::create(); - auto writerPlan = - PlanBuilder() - .values(vectors) - .tableWrite( - outputDirectory->getPath(), - {"c0"}, - 4, - {"c1"}, - { - std::make_shared( - "c2", core::SortOrder{false, false}), - }) - .project({TableWriteTraits::rowCountColumnName()}) - .singleAggregation( - {}, - {fmt::format("sum({})", TableWriteTraits::rowCountColumnName())}) - .planNode(); - - const auto spillStats = common::globalSpillStats(); - const auto spillDirectory = TempDirectoryPath::create(); - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(queryCtx) - .maxDrivers(1) - .spillDirectory(spillDirectory->getPath()) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(core::QueryConfig::kWriterSpillEnabled, "true") - // Set file writer flush threshold of zero to always trigger flush in - // test. - .config(core::QueryConfig::kWriterFlushThresholdBytes, "0") - // Set large stripe and dictionary size thresholds to avoid writer - // internal stripe flush. 
- .connectorSessionProperty( - kHiveConnectorId, dwrf::Config::kOrcWriterMaxStripeSizeSession, "1GB") - .connectorSessionProperty( - kHiveConnectorId, - dwrf::Config::kOrcWriterMaxDictionaryMemorySession, - "1GB") - .plan(std::move(writerPlan)) - .assertResults(fmt::format("SELECT {}", numRows)); - - ASSERT_EQ(arbitrator->stats().numFailures, numPrevArbitrationFailures + 1); - ASSERT_EQ( - arbitrator->stats().numNonReclaimableAttempts, - numPrevNonReclaimableAttempts + 1); - const auto updatedSpillStats = common::globalSpillStats(); - ASSERT_EQ(updatedSpillStats, spillStats); - waitForAllTasksToBeDeleted(); -} - -DEBUG_ONLY_TEST_F(TableWriterArbitrationTest, tableFileWriteError) { - VectorFuzzer::Options options; - const int batchSize = 1'000; - options.vectorSize = batchSize; - options.stringVariableLength = false; - options.stringLength = 1'000; - VectorFuzzer fuzzer(options, pool()); - const int numBatches = 20; - std::vector vectors; - for (int i = 0; i < numBatches; ++i) { - vectors.push_back(fuzzer.fuzzRow(rowType_)); - } - - createDuckDbTable(vectors); - - auto queryPool = memory::memoryManager()->addRootPool( - "tableFileWriteError", kQueryMemoryCapacity); - auto queryCtx = core::QueryCtx::create( - executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); - ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); - - std::atomic_bool injectWriterErrorOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::dwrf::Writer::write", - std::function(([&](dwrf::Writer* writer) { - auto& context = writer->getContext(); - auto& pool = - context.getMemoryPool(dwrf::MemoryUsageCategory::OUTPUT_STREAM); - if (static_cast(&pool) - ->testingMinReservationBytes() == 0) { - return; - } - if (!injectWriterErrorOnce.exchange(false)) { - return; - } - VELOX_FAIL("inject writer error"); - }))); - - const auto spillDirectory = TempDirectoryPath::create(); - const auto outputDirectory = TempDirectoryPath::create(); - auto writerPlan = PlanBuilder() - .values(vectors) - .tableWrite(outputDirectory->getPath()) - .planNode(); - VELOX_ASSERT_THROW( - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(queryCtx) - .maxDrivers(1) - .spillDirectory(spillDirectory->getPath()) - .config(core::QueryConfig::kSpillEnabled, true) - .config(core::QueryConfig::kWriterSpillEnabled, true) - // Set 0 file writer flush threshold to always reclaim memory from - // file writer. - .config(core::QueryConfig::kWriterFlushThresholdBytes, 0) - // Set stripe size to extreme large to avoid writer internal - // triggered flush. - .connectorSessionProperty( - kHiveConnectorId, - dwrf::Config::kOrcWriterMaxStripeSizeSession, - "1GB") - .connectorSessionProperty( - kHiveConnectorId, - dwrf::Config::kOrcWriterMaxDictionaryMemorySession, - "1GB") - .plan(std::move(writerPlan)) - .copyResults(pool()), - "inject writer error"); - - waitForAllTasksToBeDeleted(); -} - -DEBUG_ONLY_TEST_F(TableWriterArbitrationTest, tableWriteSpillUseMoreMemory) { - // Create a large number of vectors to trigger writer spill. 
- fuzzerOpts_.vectorSize = 1000; - fuzzerOpts_.stringLength = 2048; - fuzzerOpts_.stringVariableLength = false; - VectorFuzzer fuzzer(fuzzerOpts_, pool()); - - std::vector vectors; - for (int i = 0; i < 10; ++i) { - vectors.push_back(fuzzer.fuzzInputRow(rowType_)); - } - - auto queryPool = memory::memoryManager()->addRootPool( - "tableWriteSpillUseMoreMemory", kQueryMemoryCapacity / 4); - auto queryCtx = core::QueryCtx::create( - executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); - ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity / 4); - - auto fakeLeafPool = queryCtx->pool()->addLeafChild( - "fakeLeaf", true, FakeMemoryReclaimer::create()); - const int fakeAllocationSize = kQueryMemoryCapacity * 3 / 16; - TestAllocation injectedFakeAllocation{ - fakeLeafPool.get(), - fakeLeafPool->allocate(fakeAllocationSize), - fakeAllocationSize}; - - TestAllocation injectedWriterAllocation; - SCOPED_TESTVALUE_SET( - "facebook::velox::dwrf::Writer::flushInternal", - std::function(([&](dwrf::Writer* writer) { - ASSERT_TRUE(memory::underMemoryArbitration()); - injectedFakeAllocation.free(); - auto& pool = writer->getContext().getMemoryPool( - dwrf::MemoryUsageCategory::GENERAL); - injectedWriterAllocation.pool = &pool; - injectedWriterAllocation.size = kQueryMemoryCapacity / 8; - injectedWriterAllocation.buffer = - pool.allocate(injectedWriterAllocation.size); - }))); - - // Free the extra fake memory allocations to make memory pool state - // consistent at the end of test. - std::atomic_bool clearAllocationOnce{true}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::setError", - std::function(([&](Task* task) { - if (!clearAllocationOnce.exchange(false)) { - return; - } - ASSERT_EQ(injectedWriterAllocation.size, kQueryMemoryCapacity / 8); - injectedWriterAllocation.free(); - }))); - - const auto spillDirectory = TempDirectoryPath::create(); - const auto outputDirectory = TempDirectoryPath::create(); - auto writerPlan = PlanBuilder() - .values(vectors) - .tableWrite(outputDirectory->getPath()) - .planNode(); - VELOX_ASSERT_THROW( - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(queryCtx) - .maxDrivers(1) - .spillDirectory(spillDirectory->getPath()) - .config(core::QueryConfig::kSpillEnabled, true) - .config(core::QueryConfig::kWriterSpillEnabled, true) - // Set 0 file writer flush threshold to always trigger flush in - // test. - .config(core::QueryConfig::kWriterFlushThresholdBytes, 0) - // Set stripe size to extreme large to avoid writer internal - // triggered flush. - .connectorSessionProperty( - kHiveConnectorId, - dwrf::Config::kOrcWriterMaxStripeSizeSession, - "1GB") - .connectorSessionProperty( - kHiveConnectorId, - dwrf::Config::kOrcWriterMaxDictionaryMemorySession, - "1GB") - .plan(std::move(writerPlan)) - .copyResults(pool()), - ""); - - waitForAllTasksToBeDeleted(); -} - -DEBUG_ONLY_TEST_F(TableWriterArbitrationTest, tableWriteReclaimOnClose) { - // Create a large number of vectors to trigger writer spill. 
- fuzzerOpts_.vectorSize = 1000; - fuzzerOpts_.stringLength = 1024; - fuzzerOpts_.stringVariableLength = false; - VectorFuzzer fuzzer(fuzzerOpts_, pool()); - std::vector vectors; - int numRows{0}; - for (int i = 0; i < 10; ++i) { - vectors.push_back(fuzzer.fuzzInputRow(rowType_)); - numRows += vectors.back()->size(); - } - - auto queryPool = memory::memoryManager()->addRootPool( - "tableWriteSpillUseMoreMemory", kQueryMemoryCapacity); - auto queryCtx = core::QueryCtx::create( - executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); - ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); - - auto fakeQueryPool = - memory::memoryManager()->addRootPool("fake", kQueryMemoryCapacity); - auto fakeQueryCtx = core::QueryCtx::create( - executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(fakeQueryPool)); - ASSERT_EQ(fakeQueryCtx->pool()->capacity(), kQueryMemoryCapacity); - - auto fakeLeafPool = fakeQueryCtx->pool()->addLeafChild( - "fakeLeaf", true, FakeMemoryReclaimer::create()); - - std::atomic_bool writerNoMoreInput{false}; - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Driver::runInternal::noMoreInput", - std::function(([&](Operator* op) { - if (op->operatorType() != "TableWrite") { - return; - } - writerNoMoreInput = true; - }))); - - std::atomic maybeReserveInjectOnce{true}; - TestAllocation fakeAllocation; - SCOPED_TESTVALUE_SET( - "facebook::velox::common::memory::MemoryPoolImpl::maybeReserve", - std::function([&](memory::MemoryPool* pool) { - if (!writerNoMoreInput) { - return; - } - if (!maybeReserveInjectOnce.exchange(false)) { - return; - } - // The injection memory allocation to cause maybeReserve on writer - // close to trigger memory arbitration. The latter tries to reclaim - // memory from this file writer. - const size_t injectAllocationSize = kQueryMemoryCapacity; - fakeAllocation = TestAllocation{ - .pool = fakeLeafPool.get(), - .buffer = fakeLeafPool->allocate(injectAllocationSize), - .size = injectAllocationSize}; - })); - - SCOPED_TESTVALUE_SET( - "facebook::velox::dwrf::Writer::flushStripe", - std::function( - [&](dwrf::Writer* writer) { fakeAllocation.free(); })); - - const auto spillDirectory = TempDirectoryPath::create(); - const auto outputDirectory = TempDirectoryPath::create(); - auto writerPlan = - PlanBuilder() - .values(vectors) - .tableWrite(outputDirectory->getPath()) - .singleAggregation( - {}, - {fmt::format("sum({})", TableWriteTraits::rowCountColumnName())}) - .planNode(); - - AssertQueryBuilder(duckDbQueryRunner_) - .queryCtx(queryCtx) - .maxDrivers(1) - .spillDirectory(spillDirectory->getPath()) - .config(core::QueryConfig::kSpillEnabled, true) - .config(core::QueryConfig::kWriterSpillEnabled, true) - // Set 0 file writer flush threshold to always trigger flush in test. - .config(core::QueryConfig::kWriterFlushThresholdBytes, 0) - // Set stripe size to extreme large to avoid writer internal triggered - // flush. 
- .connectorSessionProperty( - kHiveConnectorId, dwrf::Config::kOrcWriterMaxStripeSizeSession, "1GB") - .connectorSessionProperty( - kHiveConnectorId, - dwrf::Config::kOrcWriterMaxDictionaryMemorySession, - "1GB") - .plan(std::move(writerPlan)) - .assertResults(fmt::format("SELECT {}", numRows)); - - waitForAllTasksToBeDeleted(); -} - -DEBUG_ONLY_TEST_F( - TableWriterArbitrationTest, - raceBetweenWriterCloseAndTaskReclaim) { - const uint64_t memoryCapacity = 512 * MB; - std::vector vectors = - createVectors(rowType_, memoryCapacity / 8, fuzzerOpts_); - const auto expectedResult = - runWriteTask(vectors, nullptr, false, 1, pool(), kHiveConnectorId, false) - .data; - auto queryPool = memory::memoryManager()->addRootPool( - "tableWriteSpillUseMoreMemory", kQueryMemoryCapacity); - auto queryCtx = core::QueryCtx::create( - executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); - ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); - - std::atomic_bool writerCloseWaitFlag{true}; - folly::EventCount writerCloseWait; - std::atomic_bool taskReclaimWaitFlag{true}; - folly::EventCount taskReclaimWait; - SCOPED_TESTVALUE_SET( - "facebook::velox::dwrf::Writer::flushStripe", - std::function(([&](dwrf::Writer* writer) { - writerCloseWaitFlag = false; - writerCloseWait.notifyAll(); - taskReclaimWait.await([&]() { return !taskReclaimWaitFlag.load(); }); - }))); - - SCOPED_TESTVALUE_SET( - "facebook::velox::exec::Task::requestPauseLocked", - std::function(([&](Task* /*unused*/) { - taskReclaimWaitFlag = false; - taskReclaimWait.notifyAll(); - }))); - - std::thread queryThread([&]() { - const auto result = runWriteTask( - vectors, - queryCtx, - false, - 1, - pool(), - kHiveConnectorId, - true, - expectedResult); - }); - - writerCloseWait.await([&]() { return !writerCloseWaitFlag.load(); }); - - memory::testingRunArbitration(); - - queryThread.join(); - waitForAllTasksToBeDeleted(); -} -} // namespace velox::exec::test +// #include "velox/exec/tests/utils/TableWriterTestBase.h" +// +// #include "folly/dynamic.h" +// #include "velox/common/base/Fs.h" +// #include "velox/common/base/tests/GTestUtils.h" +// #include "velox/common/hyperloglog/SparseHll.h" +// #include "velox/common/testutil/TestValue.h" +// #include "velox/connectors/hive/HiveConfig.h" +// #include "velox/connectors/hive/HivePartitionFunction.h" +// #include "velox/dwio/common/WriterFactory.h" +// #include "velox/exec/PlanNodeStats.h" +// #include "velox/exec/TableWriter.h" +// #include "velox/exec/tests/utils/AssertQueryBuilder.h" +// #include "velox/exec/tests/utils/HiveConnectorTestBase.h" +// #include "velox/exec/tests/utils/PlanBuilder.h" +// #include "velox/exec/tests/utils/TempDirectoryPath.h" +// #include "velox/vector/fuzzer/VectorFuzzer.h" +// +// #include +// #include +// #include "folly/experimental/EventCount.h" +// #include "velox/common/memory/MemoryArbitrator.h" +// #include "velox/dwio/common/Options.h" +// #include "velox/dwio/dwrf/writer/Writer.h" +// #include "velox/exec/tests/utils/ArbitratorTestUtil.h" +// +// namespace velox::exec::test { +// constexpr uint64_t kQueryMemoryCapacity = 512 * MB; +// +// class BasicTableWriterTestBase : public HiveConnectorTestBase {}; +// +// TEST_F(BasicTableWriterTestBase, roundTrip) { +// vector_size_t size = 1'000; +// auto data = makeRowVector({ +// makeFlatVector(size, [](auto row) { return row; }), +// makeFlatVector( +// size, [](auto row) { return row * 2; }, nullEvery(7)), +// }); +// +// auto sourceFilePath = TempFilePath::create(); +// 
writeToFile(sourceFilePath->getPath(), data); +// +// auto targetDirectoryPath = TempDirectoryPath::create(); +// +// auto rowType = asRowType(data->type()); +// auto plan = PlanBuilder() +// .tableScan(rowType) +// .tableWrite(targetDirectoryPath->getPath()) +// .planNode(); +// +// auto results = AssertQueryBuilder(plan) +// .split(makeHiveConnectorSplit(sourceFilePath->getPath())) +// .copyResults(pool()); +// ASSERT_EQ(2, results->size()); +// +// // First column has number of rows written in the first row and nulls in +// other +// // rows. +// auto rowCount = results->childAt(TableWriteTraits::kRowCountChannel) +// ->as>(); +// ASSERT_FALSE(rowCount->isNullAt(0)); +// ASSERT_EQ(size, rowCount->valueAt(0)); +// ASSERT_TRUE(rowCount->isNullAt(1)); +// +// // Second column contains details about written files. +// auto details = results->childAt(TableWriteTraits::kFragmentChannel) +// ->as>(); +// ASSERT_TRUE(details->isNullAt(0)); +// ASSERT_FALSE(details->isNullAt(1)); +// folly::dynamic obj = folly::parseJson(details->valueAt(1)); +// +// ASSERT_EQ(size, obj["rowCount"].asInt()); +// auto fileWriteInfos = obj["fileWriteInfos"]; +// ASSERT_EQ(1, fileWriteInfos.size()); +// +// auto writeFileName = fileWriteInfos[0]["writeFileName"].asString(); +// +// // Read from 'writeFileName' and verify the data matches the original. +// plan = PlanBuilder().tableScan(rowType).planNode(); +// +// auto copy = AssertQueryBuilder(plan) +// .split(makeHiveConnectorSplit(fmt::format( +// "{}/{}", targetDirectoryPath->getPath(), +// writeFileName))) +// .copyResults(pool()); +// assertEqualResults({data}, {copy}); +// } +// +//// Generates a struct (row), write it as a flap map, and check that it is read +//// back as a map. +// TEST_F(BasicTableWriterTestBase, structAsMap) { +// // Input struct type. +// vector_size_t size = 1'000; +// auto data = makeRowVector( +// {"col1"}, +// { +// makeRowVector( +// // Struct field names are the feature/map keys. +// {"1", "2"}, +// { +// makeFlatVector(size, [](auto row) { return row; +// }), makeFlatVector(size, [](auto row) { return +// row; }), +// }), +// }); +// +// // Write it as a flat map. +// auto outputType = ROW({"col1"}, {MAP(INTEGER(), INTEGER())}); +// auto targetDirectoryPath = TempDirectoryPath::create(); +// std::string fileName = "output_file"; +// +// auto plan = PlanBuilder() +// .values({data}) +// .tableWrite( +// targetDirectoryPath->getPath(), +// {}, +// 0, +// {}, +// {}, +// dwio::common::FileFormat::DWRF, +// {}, +// PlanBuilder::kHiveDefaultConnectorId, +// { +// {"orc.flatten.map", "true"}, +// {"orc.map.flat.cols", "0"}, +// {"orc.map.flat.cols.struct.keys", "[[\"1\", +// \"2\"]]"}, +// }, +// nullptr, +// fileName, +// velox::common::CompressionKind_NONE, +// outputType) +// .planNode(); +// auto writerResults = AssertQueryBuilder(plan).copyResults(pool()); +// +// // Check we get the expected map after reading. +// auto expected = makeRowVector( +// {"col1"}, +// { +// makeMapVector( +// size, +// [](auto /*row*/) { return 2; }, +// [](auto row) { return row % 2 == 0 ? 
2 : 1; }, +// [](auto row) { return row / 2; }), +// }); +// plan = PlanBuilder().tableScan(outputType).planNode(); +// AssertQueryBuilder(plan) +// .split(makeHiveConnectorSplit( +// targetDirectoryPath->getPath() + "/" + fileName)) +// .assertResults(expected); +// } +// +// TEST_F(BasicTableWriterTestBase, targetFileName) { +// constexpr const char* kFileName = "test.dwrf"; +// auto data = makeRowVector({makeFlatVector(10, folly::identity)}); +// auto directory = TempDirectoryPath::create(); +// auto plan = PlanBuilder() +// .values({data}) +// .tableWrite( +// directory->getPath(), +// dwio::common::FileFormat::DWRF, +// {}, +// nullptr, +// kFileName) +// .planNode(); +// auto results = AssertQueryBuilder(plan).copyResults(pool()); +// auto* details = results->childAt(TableWriteTraits::kFragmentChannel) +// ->asUnchecked>(); +// auto detail = folly::parseJson(details->valueAt(1)); +// auto fileWriteInfos = detail["fileWriteInfos"]; +// ASSERT_EQ(1, fileWriteInfos.size()); +// ASSERT_EQ(fileWriteInfos[0]["writeFileName"].asString(), kFileName); +// plan = PlanBuilder().tableScan(asRowType(data->type())).planNode(); +// AssertQueryBuilder(plan) +// .split(makeHiveConnectorSplit( +// fmt::format("{}/{}", directory->getPath(), kFileName))) +// .assertResults(data); +// } +// +// class PartitionedTableWriterTest +// : public TableWriterTestBase, +// public testing::WithParamInterface { +// public: +// PartitionedTableWriterTest() : TableWriterTestBase(GetParam()) {} +// +// static std::vector getTestParams() { +// std::vector testParams; +// const std::vector multiDriverOptions = {false, true}; +// std::vector fileFormats = {FileFormat::DWRF}; +// if (hasWriterFactory(FileFormat::PARQUET)) { +// fileFormats.push_back(FileFormat::PARQUET); +// } +// for (bool multiDrivers : multiDriverOptions) { +// for (FileFormat fileFormat : fileFormats) { +// for (bool scaleWriter : {false, true}) { +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kPartitioned, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kPartitioned, +// connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kBucketed, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kBucketed, +// connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kBucketed, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kPrestoNative, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kBucketed, +// connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kPrestoNative, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// } +// } +// } +// return testParams; +// } +// }; +// +// class 
UnpartitionedTableWriterTest +// : public TableWriterTestBase, +// public testing::WithParamInterface { +// public: +// UnpartitionedTableWriterTest() : TableWriterTestBase(GetParam()) {} +// +// static std::vector getTestParams() { +// std::vector testParams; +// const std::vector multiDriverOptions = {false, true}; +// std::vector fileFormats = {FileFormat::DWRF}; +// if (hasWriterFactory(FileFormat::PARQUET)) { +// fileFormats.push_back(FileFormat::PARQUET); +// } +// for (bool multiDrivers : multiDriverOptions) { +// for (FileFormat fileFormat : fileFormats) { +// for (bool scaleWriter : {false, true}) { +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kUnpartitioned, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_NONE, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kUnpartitioned, +// connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_NONE, +// scaleWriter} +// .value); +// } +// } +// } +// return testParams; +// } +// }; +// +// class BucketedTableOnlyWriteTest +// : public TableWriterTestBase, +// public testing::WithParamInterface { +// public: +// BucketedTableOnlyWriteTest() : TableWriterTestBase(GetParam()) {} +// +// static std::vector getTestParams() { +// std::vector testParams; +// const std::vector multiDriverOptions = {false, true}; +// std::vector fileFormats = {FileFormat::DWRF}; +// if (hasWriterFactory(FileFormat::PARQUET)) { +// fileFormats.push_back(FileFormat::PARQUET); +// } +// const std::vector bucketModes = { +// TestMode::kBucketed, TestMode::kOnlyBucketed}; +// for (bool multiDrivers : multiDriverOptions) { +// for (FileFormat fileFormat : fileFormats) { +// for (auto bucketMode : bucketModes) { +// testParams.push_back(TestParam{ +// fileFormat, +// bucketMode, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// /*scaleWriter=*/false} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// bucketMode, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// true, +// multiDrivers, +// CompressionKind_ZSTD, +// /*scaleWriter=*/false} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// bucketMode, +// connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// /*scaleWriter=*/false} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// bucketMode, +// connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// true, +// multiDrivers, +// CompressionKind_ZSTD, +// /*scaleWriter=*/false} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// bucketMode, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kPrestoNative, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// /*scaleWriter=*/false} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// bucketMode, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kPrestoNative, +// true, +// multiDrivers, +// CompressionKind_ZSTD, +// /*scaleWriter=*/false} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// bucketMode, +// 
connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kPrestoNative, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// /*scaleWriter=*/false} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// bucketMode, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kPrestoNative, +// true, +// multiDrivers, +// CompressionKind_ZSTD, +// /*scaleWriter=*/false} +// .value); +// } +// } +// } +// return testParams; +// } +// }; +// +// class BucketSortOnlyTableWriterTest +// : public TableWriterTestBase, +// public testing::WithParamInterface { +// public: +// BucketSortOnlyTableWriterTest() : TableWriterTestBase(GetParam()) {} +// +// static std::vector getTestParams() { +// std::vector testParams; +// const std::vector multiDriverOptions = {false, true}; +// std::vector fileFormats = {FileFormat::DWRF}; +// if (hasWriterFactory(FileFormat::PARQUET)) { +// fileFormats.push_back(FileFormat::PARQUET); +// } +// const std::vector bucketModes = { +// TestMode::kBucketed, TestMode::kOnlyBucketed}; +// for (bool multiDrivers : multiDriverOptions) { +// for (FileFormat fileFormat : fileFormats) { +// for (auto bucketMode : bucketModes) { +// testParams.push_back(TestParam{ +// fileFormat, +// bucketMode, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// true, +// multiDrivers, +// facebook::velox::common::CompressionKind_ZSTD, +// /*scaleWriter=*/false} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// bucketMode, +// connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// true, +// multiDrivers, +// facebook::velox::common::CompressionKind_NONE, +// /*scaleWriter=*/false} +// .value); +// } +// } +// } +// return testParams; +// } +// }; +// +// class PartitionedWithoutBucketTableWriterTest +// : public TableWriterTestBase, +// public testing::WithParamInterface { +// public: +// PartitionedWithoutBucketTableWriterTest() : TableWriterTestBase(GetParam()) +// {} +// +// static std::vector getTestParams() { +// std::vector testParams; +// const std::vector multiDriverOptions = {false, true}; +// std::vector fileFormats = {FileFormat::DWRF}; +// if (hasWriterFactory(FileFormat::PARQUET)) { +// fileFormats.push_back(FileFormat::PARQUET); +// } +// for (bool multiDrivers : multiDriverOptions) { +// for (FileFormat fileFormat : fileFormats) { +// for (bool scaleWriter : {false, true}) { +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kPartitioned, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kPartitioned, +// connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// true, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// } +// } +// } +// return testParams; +// } +// }; +// +// class AllTableWriterTest : public TableWriterTestBase, +// public testing::WithParamInterface { +// public: +// AllTableWriterTest() : TableWriterTestBase(GetParam()) {} +// +// static std::vector getTestParams() { +// std::vector testParams; +// const std::vector multiDriverOptions = {false, true}; +// std::vector fileFormats = {FileFormat::DWRF}; +// if (hasWriterFactory(FileFormat::PARQUET)) { +// fileFormats.push_back(FileFormat::PARQUET); +// } +// for (bool 
multiDrivers : multiDriverOptions) { +// for (FileFormat fileFormat : fileFormats) { +// for (bool scaleWriter : {false, true}) { +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kUnpartitioned, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kUnpartitioned, +// connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kPartitioned, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kPartitioned, +// connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kBucketed, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kBucketed, +// connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kBucketed, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kPrestoNative, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kBucketed, +// connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kPrestoNative, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kOnlyBucketed, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kOnlyBucketed, +// connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kHiveCompatible, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kOnlyBucketed, +// connector::common::CommitStrategy::kNoCommit, +// HiveBucketProperty::Kind::kPrestoNative, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// testParams.push_back(TestParam{ +// fileFormat, +// TestMode::kOnlyBucketed, +// connector::common::CommitStrategy::kTaskCommit, +// HiveBucketProperty::Kind::kPrestoNative, +// false, +// multiDrivers, +// CompressionKind_ZSTD, +// scaleWriter} +// .value); +// } +// } +// } +// return testParams; +// } +// }; +// +//// Runs a pipeline with read + filter + project (with substr) + write. 
+// TEST_P(AllTableWriterTest, scanFilterProjectWrite) { +// auto filePaths = makeFilePaths(5); +// auto vectors = makeVectors(filePaths.size(), 500); +// for (int i = 0; i < filePaths.size(); i++) { +// writeToFile(filePaths[i]->getPath(), vectors[i]); +// } +// +// createDuckDbTable(vectors); +// +// auto outputDirectory = TempDirectoryPath::create(); +// +// auto planBuilder = PlanBuilder(); +// auto project = planBuilder.tableScan(rowType_).filter("c2 <> 0").project( +// {"c0", "c1", "c3", "c5", "c2 + c3", "substr(c5, 1, 1)"}); +// +// auto intputTypes = project.planNode()->outputType()->children(); +// std::vector tableColumnNames = { +// "c0", "c1", "c3", "c5", "c2_plus_c3", "substr_c5"}; +// const auto outputType = +// ROW(std::move(tableColumnNames), std::move(intputTypes)); +// +// auto plan = createInsertPlan( +// project, +// outputType, +// outputDirectory->getPath(), +// partitionedBy_, +// bucketProperty_, +// compressionKind_, +// getNumWriters(), +// connector::common::LocationHandle::TableType::kNew, +// commitStrategy_); +// +// assertQueryWithWriterConfigs( +// plan, filePaths, "SELECT count(*) FROM tmp WHERE c2 <> 0"); +// +// // To test the correctness of the generated output, +// // We create a new plan that only read that file and then +// // compare that against a duckDB query that runs the whole query. +// if (partitionedBy_.size() > 0) { +// auto newOutputType = getNonPartitionsColumns(partitionedBy_, outputType); +// assertQuery( +// PlanBuilder().tableScan(newOutputType).planNode(), +// makeHiveConnectorSplits(outputDirectory), +// "SELECT c3, c5, c2 + c3, substr(c5, 1, 1) FROM tmp WHERE c2 <> 0"); +// verifyTableWriterOutput(outputDirectory->getPath(), newOutputType, +// false); +// } else { +// assertQuery( +// PlanBuilder().tableScan(outputType).planNode(), +// makeHiveConnectorSplits(outputDirectory), +// "SELECT c0, c1, c3, c5, c2 + c3, substr(c5, 1, 1) FROM tmp WHERE c2 +// <> 0"); +// verifyTableWriterOutput(outputDirectory->getPath(), outputType, false); +// } +// } +// +// TEST_P(AllTableWriterTest, renameAndReorderColumns) { +// auto filePaths = makeFilePaths(5); +// auto vectors = makeVectors(filePaths.size(), 500); +// for (int i = 0; i < filePaths.size(); ++i) { +// writeToFile(filePaths[i]->getPath(), vectors[i]); +// } +// +// createDuckDbTable(vectors); +// +// auto outputDirectory = TempDirectoryPath::create(); +// +// if (testMode_ == TestMode::kPartitioned || testMode_ == +// TestMode::kBucketed) { +// const std::vector partitionBy = {"x", "y"}; +// setPartitionBy(partitionBy); +// } +// if (testMode_ == TestMode::kBucketed || +// testMode_ == TestMode::kOnlyBucketed) { +// setBucketProperty( +// bucketProperty_->kind(), +// bucketProperty_->bucketCount(), +// {"z", "v"}, +// {REAL(), VARCHAR()}, +// {}); +// } +// +// auto inputRowType = +// ROW({"c2", "c5", "c4", "c1", "c0", "c3"}, +// {SMALLINT(), VARCHAR(), DOUBLE(), INTEGER(), BIGINT(), REAL()}); +// +// setTableSchema( +// ROW({"u", "v", "w", "x", "y", "z"}, +// {SMALLINT(), VARCHAR(), DOUBLE(), INTEGER(), BIGINT(), REAL()})); +// +// auto plan = createInsertPlan( +// PlanBuilder().tableScan(rowType_), +// inputRowType, +// tableSchema_, +// outputDirectory->getPath(), +// partitionedBy_, +// bucketProperty_, +// compressionKind_, +// getNumWriters(), +// connector::common::LocationHandle::TableType::kNew, +// commitStrategy_); +// +// assertQueryWithWriterConfigs(plan, filePaths, "SELECT count(*) FROM tmp"); +// +// if (partitionedBy_.size() > 0) { +// auto newOutputType = 
getNonPartitionsColumns(partitionedBy_, +// tableSchema_); HiveConnectorTestBase::assertQuery( +// PlanBuilder().tableScan(newOutputType).planNode(), +// makeHiveConnectorSplits(outputDirectory), +// "SELECT c2, c5, c4, c3 FROM tmp"); +// +// verifyTableWriterOutput(outputDirectory->getPath(), newOutputType, +// false); +// } else { +// HiveConnectorTestBase::assertQuery( +// PlanBuilder().tableScan(tableSchema_).planNode(), +// makeHiveConnectorSplits(outputDirectory), +// "SELECT c2, c5, c4, c1, c0, c3 FROM tmp"); +// +// verifyTableWriterOutput(outputDirectory->getPath(), tableSchema_, false); +// } +// } +// +//// Runs a pipeline with read + write. +// TEST_P(AllTableWriterTest, directReadWrite) { +// auto filePaths = makeFilePaths(5); +// auto vectors = makeVectors(filePaths.size(), 200); +// for (int i = 0; i < filePaths.size(); i++) { +// writeToFile(filePaths[i]->getPath(), vectors[i]); +// } +// +// createDuckDbTable(vectors); +// +// auto outputDirectory = TempDirectoryPath::create(); +// auto plan = createInsertPlan( +// PlanBuilder().tableScan(rowType_), +// rowType_, +// outputDirectory->getPath(), +// partitionedBy_, +// bucketProperty_, +// compressionKind_, +// getNumWriters(), +// connector::common::LocationHandle::TableType::kNew, +// commitStrategy_); +// +// assertQuery(plan, filePaths, "SELECT count(*) FROM tmp"); +// +// // To test the correctness of the generated output, +// // We create a new plan that only read that file and then +// // compare that against a duckDB query that runs the whole query. +// +// if (partitionedBy_.size() > 0) { +// auto newOutputType = getNonPartitionsColumns(partitionedBy_, +// tableSchema_); assertQuery( +// PlanBuilder().tableScan(newOutputType).planNode(), +// makeHiveConnectorSplits(outputDirectory), +// "SELECT c2, c3, c4, c5 FROM tmp"); +// rowType_ = newOutputType; +// verifyTableWriterOutput(outputDirectory->getPath(), rowType_); +// } else { +// assertQuery( +// PlanBuilder().tableScan(rowType_).planNode(), +// makeHiveConnectorSplits(outputDirectory), +// "SELECT * FROM tmp"); +// +// verifyTableWriterOutput(outputDirectory->getPath(), rowType_); +// } +// } +// +//// Tests writing constant vectors. +// TEST_P(AllTableWriterTest, constantVectors) { +// vector_size_t size = 1'000; +// +// // Make constant vectors of various types with null and non-null values. 
+// auto vector = makeConstantVector(size); +// +// createDuckDbTable({vector}); +// +// auto outputDirectory = TempDirectoryPath::create(); +// auto op = createInsertPlan( +// PlanBuilder().values({vector}), +// rowType_, +// outputDirectory->getPath(), +// partitionedBy_, +// bucketProperty_, +// compressionKind_, +// getNumWriters(), +// connector::common::LocationHandle::TableType::kNew, +// commitStrategy_); +// +// assertQuery(op, fmt::format("SELECT {}", size)); +// +// if (partitionedBy_.size() > 0) { +// auto newOutputType = getNonPartitionsColumns(partitionedBy_, +// tableSchema_); assertQuery( +// PlanBuilder().tableScan(newOutputType).planNode(), +// makeHiveConnectorSplits(outputDirectory), +// "SELECT c2, c3, c4, c5 FROM tmp"); +// rowType_ = newOutputType; +// verifyTableWriterOutput(outputDirectory->getPath(), rowType_); +// } else { +// assertQuery( +// PlanBuilder().tableScan(rowType_).planNode(), +// makeHiveConnectorSplits(outputDirectory), +// "SELECT * FROM tmp"); +// +// verifyTableWriterOutput(outputDirectory->getPath(), rowType_); +// } +// } +// +// TEST_P(AllTableWriterTest, emptyInput) { +// auto outputDirectory = TempDirectoryPath::create(); +// auto vector = makeConstantVector(0); +// auto op = createInsertPlan( +// PlanBuilder().values({vector}), +// rowType_, +// outputDirectory->getPath(), +// partitionedBy_, +// bucketProperty_, +// compressionKind_, +// getNumWriters(), +// connector::common::LocationHandle::TableType::kNew, +// commitStrategy_); +// +// assertQuery(op, "SELECT 0"); +// } +// +// TEST_P(AllTableWriterTest, commitStrategies) { +// auto filePaths = makeFilePaths(5); +// auto vectors = makeVectors(filePaths.size(), 100); +// +// createDuckDbTable(vectors); +// +// // Test the kTaskCommit commit strategy writing to one dot-prefixed +// // temporary file. +// { +// SCOPED_TRACE(CommitStrategy::kTaskCommit); +// auto outputDirectory = TempDirectoryPath::create(); +// auto plan = createInsertPlan( +// PlanBuilder().values(vectors), +// rowType_, +// outputDirectory->getPath(), +// partitionedBy_, +// bucketProperty_, +// compressionKind_, +// getNumWriters(), +// connector::common::LocationHandle::TableType::kNew, +// commitStrategy_); +// +// assertQuery(plan, "SELECT count(*) FROM tmp"); +// +// if (partitionedBy_.size() > 0) { +// auto newOutputType = +// getNonPartitionsColumns(partitionedBy_, tableSchema_); +// assertQuery( +// PlanBuilder().tableScan(newOutputType).planNode(), +// makeHiveConnectorSplits(outputDirectory), +// "SELECT c2, c3, c4, c5 FROM tmp"); +// auto originalRowType = rowType_; +// rowType_ = newOutputType; +// verifyTableWriterOutput(outputDirectory->getPath(), rowType_); +// rowType_ = originalRowType; +// } else { +// assertQuery( +// PlanBuilder().tableScan(rowType_).planNode(), +// makeHiveConnectorSplits(outputDirectory), +// "SELECT * FROM tmp"); +// verifyTableWriterOutput(outputDirectory->getPath(), rowType_); +// } +// } +// // Test kNoCommit commit strategy writing to non-temporary files. 
+// {
+// SCOPED_TRACE(CommitStrategy::kNoCommit);
+// auto outputDirectory = TempDirectoryPath::create();
+// setCommitStrategy(CommitStrategy::kNoCommit);
+// auto plan = createInsertPlan(
+// PlanBuilder().values(vectors),
+// rowType_,
+// outputDirectory->getPath(),
+// partitionedBy_,
+// bucketProperty_,
+// compressionKind_,
+// getNumWriters(),
+// connector::common::LocationHandle::TableType::kNew,
+// commitStrategy_);
+//
+// assertQuery(plan, "SELECT count(*) FROM tmp");
+//
+// if (partitionedBy_.size() > 0) {
+// auto newOutputType =
+// getNonPartitionsColumns(partitionedBy_, tableSchema_);
+// assertQuery(
+// PlanBuilder().tableScan(newOutputType).planNode(),
+// makeHiveConnectorSplits(outputDirectory),
+// "SELECT c2, c3, c4, c5 FROM tmp");
+// rowType_ = newOutputType;
+// verifyTableWriterOutput(outputDirectory->getPath(), rowType_);
+// } else {
+// assertQuery(
+// PlanBuilder().tableScan(rowType_).planNode(),
+// makeHiveConnectorSplits(outputDirectory),
+// "SELECT * FROM tmp");
+// verifyTableWriterOutput(outputDirectory->getPath(), rowType_);
+// }
+// }
+// }
+//
+// TEST_P(PartitionedTableWriterTest, specialPartitionName) {
+// const int32_t numPartitions = 50;
+// const int32_t numBatches = 2;
+//
+// const auto rowType =
+// ROW({"c0", "p0", "p1", "c1", "c3", "c5"},
+// {INTEGER(), INTEGER(), VARCHAR(), BIGINT(), REAL(), VARCHAR()});
+// const std::vector partitionKeys = {"p0", "p1"};
+// const std::vector partitionTypes = {INTEGER(), VARCHAR()};
+//
+// const std::vector charsToEscape = {
+// '"',
+// '#',
+// '%',
+// '\'',
+// '*',
+// '/',
+// ':',
+// '=',
+// '?',
+// '\\',
+// '\x7F',
+// '{',
+// '[',
+// ']',
+// '^'};
+// ASSERT_GE(numPartitions, charsToEscape.size());
+// std::vector vectors = makeBatches(numBatches, [&](auto) {
+// return makeRowVector(
+// rowType->names(),
+// {
+// makeFlatVector(
+// numPartitions, [&](auto row) { return row + 100; }),
+// makeFlatVector(
+// numPartitions, [&](auto row) { return row; }),
+// makeFlatVector(
+// numPartitions,
+// [&](auto row) {
+// // special character
+// return StringView::makeInline(
+// fmt::format("str_{}{}", row, charsToEscape.at(row % 15)));
+// }),
+// makeFlatVector(
+// numPartitions, [&](auto row) { return row + 1000; }),
+// makeFlatVector(
+// numPartitions, [&](auto row) { return row + 33.23; }),
+// makeFlatVector(
+// numPartitions,
+// [&](auto row) {
+// return StringView::makeInline(
+// fmt::format("bucket_{}", row * 3));
+// }),
+// });
+// });
+// createDuckDbTable(vectors);
+//
+// auto inputFilePaths = makeFilePaths(numBatches);
+// for (int i = 0; i < numBatches; i++) {
+// writeToFile(inputFilePaths[i]->getPath(), vectors[i]);
+// }
+//
+// auto outputDirectory = TempDirectoryPath::create();
+// auto plan = createInsertPlan(
+// PlanBuilder().tableScan(rowType),
+// rowType,
+// outputDirectory->getPath(),
+// partitionKeys,
+// bucketProperty_,
+// compressionKind_,
+// getNumWriters(),
+// connector::common::LocationHandle::TableType::kNew,
+// commitStrategy_);
+//
+// auto task = assertQuery(plan, inputFilePaths, "SELECT count(*) FROM tmp");
+//
+// std::set actualPartitionDirectories =
+// getLeafSubdirectories(outputDirectory->getPath());
+//
+// std::set expectedPartitionDirectories;
+// const std::vector expectedCharsAfterEscape = {
+// "%22",
+// "%23",
+// "%25",
+// "%27",
+// "%2A",
+// "%2F",
+// "%3A",
+// "%3D",
+// "%3F",
+// "%5C",
+// "%7F",
+// "%7B",
+// "%5B",
+// "%5D",
+// "%5E"};
+// for (auto i = 0; i < numPartitions; ++i) {
+// // url encoded
+// auto partitionName = fmt::format(
+// "p0={}/p1=str_{}{}", i, i, expectedCharsAfterEscape.at(i % 15));
+// expectedPartitionDirectories.emplace(
+// fs::path(outputDirectory->getPath()) / partitionName);
+// }
+// EXPECT_EQ(actualPartitionDirectories, expectedPartitionDirectories);
+// }
+//
+// TEST_P(PartitionedTableWriterTest, multiplePartitions) {
+// int32_t numPartitions = 50;
+// int32_t numBatches = 2;
+//
+// auto rowType =
+// ROW({"c0", "p0", "p1", "c1", "c3", "c5"},
+// {INTEGER(), INTEGER(), VARCHAR(), BIGINT(), REAL(), VARCHAR()});
+// std::vector partitionKeys = {"p0", "p1"};
+// std::vector partitionTypes = {INTEGER(), VARCHAR()};
+//
+// std::vector vectors = makeBatches(numBatches, [&](auto) {
+// return makeRowVector(
+// rowType->names(),
+// {
+// makeFlatVector(
+// numPartitions, [&](auto row) { return row + 100; }),
+// makeFlatVector(
+// numPartitions, [&](auto row) { return row; }),
+// makeFlatVector(
+// numPartitions,
+// [&](auto row) {
+// return StringView::makeInline(fmt::format("str_{}", row));
+// }),
+// makeFlatVector(
+// numPartitions, [&](auto row) { return row + 1000; }),
+// makeFlatVector(
+// numPartitions, [&](auto row) { return row + 33.23; }),
+// makeFlatVector(
+// numPartitions,
+// [&](auto row) {
+// return StringView::makeInline(
+// fmt::format("bucket_{}", row * 3));
+// }),
+// });
+// });
+// createDuckDbTable(vectors);
+//
+// auto inputFilePaths = makeFilePaths(numBatches);
+// for (int i = 0; i < numBatches; i++) {
+// writeToFile(inputFilePaths[i]->getPath(), vectors[i]);
+// }
+//
+// auto outputDirectory = TempDirectoryPath::create();
+// auto plan = createInsertPlan(
+// PlanBuilder().tableScan(rowType),
+// rowType,
+// outputDirectory->getPath(),
+// partitionKeys,
+// bucketProperty_,
+// compressionKind_,
+// getNumWriters(),
+// connector::common::LocationHandle::TableType::kNew,
+// commitStrategy_);
+//
+// auto task = assertQuery(plan, inputFilePaths, "SELECT count(*) FROM tmp");
+//
+// // Verify that there is one partition directory for each partition.
+// std::set actualPartitionDirectories =
+// getLeafSubdirectories(outputDirectory->getPath());
+//
+// std::set expectedPartitionDirectories;
+// std::set partitionNames;
+// for (auto i = 0; i < numPartitions; i++) {
+// auto partitionName = fmt::format("p0={}/p1=str_{}", i, i);
+// partitionNames.emplace(partitionName);
+// expectedPartitionDirectories.emplace(
+// fs::path(outputDirectory->getPath()) / partitionName);
+// }
+// EXPECT_EQ(actualPartitionDirectories, expectedPartitionDirectories);
+//
+// // Verify distribution of records in partition directories.
+// auto iterPartitionDirectory = actualPartitionDirectories.begin();
+// auto iterPartitionName = partitionNames.begin();
+// auto newOutputType = getNonPartitionsColumns(partitionKeys, rowType);
+// while (iterPartitionDirectory != actualPartitionDirectories.end()) {
+// assertQuery(
+// PlanBuilder().tableScan(newOutputType).planNode(),
+// makeHiveConnectorSplits(*iterPartitionDirectory),
+// fmt::format(
+// "SELECT c0, c1, c3, c5 FROM tmp WHERE {}",
+// partitionNameToPredicate(*iterPartitionName, partitionTypes)));
+// // In case of unbucketed partitioned table, one single file is written to
+// // each partition directory for Hive connector.
+// if (testMode_ == TestMode::kPartitioned) {
+// ASSERT_EQ(countRecursiveFiles(*iterPartitionDirectory), 1);
+// } else {
+// ASSERT_GE(countRecursiveFiles(*iterPartitionDirectory), 1);
+// }
+//
+// ++iterPartitionDirectory;
+// ++iterPartitionName;
+// }
+// }
+//
+// TEST_P(PartitionedTableWriterTest, singlePartition) {
+// const int32_t numBatches = 2;
+// auto rowType =
+// ROW({"c0", "p0", "c3", "c5"}, {VARCHAR(), BIGINT(), REAL(),
+// VARCHAR()});
+// std::vector partitionKeys = {"p0"};
+//
+// // Partition vector is constant vector.
+// std::vector vectors = makeBatches(numBatches, [&](auto) {
+// return makeRowVector(
+// rowType->names(),
+// {makeFlatVector(
+// 1'000,
+// [&](auto row) {
+// return StringView::makeInline(fmt::format("str_{}", row));
+// }),
+// makeConstant((int64_t)365, 1'000),
+// makeFlatVector(1'000, [&](auto row) { return row + 33.23; }),
+// makeFlatVector(1'000, [&](auto row) {
+// return StringView::makeInline(fmt::format("bucket_{}", row * 3));
+// })});
+// });
+// createDuckDbTable(vectors);
+//
+// auto inputFilePaths = makeFilePaths(numBatches);
+// for (int i = 0; i < numBatches; i++) {
+// writeToFile(inputFilePaths[i]->getPath(), vectors[i]);
+// }
+//
+// auto outputDirectory = TempDirectoryPath::create();
+// const int numWriters = getNumWriters();
+// auto plan = createInsertPlan(
+// PlanBuilder().tableScan(rowType),
+// rowType,
+// outputDirectory->getPath(),
+// partitionKeys,
+// bucketProperty_,
+// compressionKind_,
+// numWriters,
+// connector::common::LocationHandle::TableType::kNew,
+// commitStrategy_);
+//
+// auto task = assertQueryWithWriterConfigs(
+// plan, inputFilePaths, "SELECT count(*) FROM tmp");
+//
+// std::set partitionDirectories =
+// getLeafSubdirectories(outputDirectory->getPath());
+//
+// // Verify only a single partition directory is created.
+// ASSERT_EQ(partitionDirectories.size(), 1);
+// EXPECT_EQ(
+// *partitionDirectories.begin(),
+// fs::path(outputDirectory->getPath()) / "p0=365");
+//
+// // Verify all data is written to the single partition directory.
+// auto newOutputType = getNonPartitionsColumns(partitionKeys, rowType);
+// assertQuery(
+// PlanBuilder().tableScan(newOutputType).planNode(),
+// makeHiveConnectorSplits(outputDirectory),
+// "SELECT c0, c3, c5 FROM tmp");
+//
+// // In case of unbucketed partitioned table, one single file is written to
+// // each partition directory for Hive connector.
+// if (testMode_ == TestMode::kPartitioned) {
+// ASSERT_LE(countRecursiveFiles(*partitionDirectories.begin()),
+// numWriters);
+// } else {
+// ASSERT_GE(countRecursiveFiles(*partitionDirectories.begin()),
+// numWriters);
+// }
+// }
+//
+// TEST_P(PartitionedWithoutBucketTableWriterTest,
+// fromSinglePartitionToMultiple) {
+// auto rowType = ROW({"c0", "c1"}, {BIGINT(), BIGINT()});
+// setDataTypes(rowType);
+// std::vector partitionKeys = {"c0"};
+//
+// // Partition vector is constant vector.
+// std::vector vectors;
+// // The initial vector has the same partition key value;
+// vectors.push_back(makeRowVector(
+// rowType->names(),
+// {makeFlatVector(1'000, [&](auto /*unused*/) { return 1; }),
+// makeFlatVector(1'000, [&](auto row) { return row + 1; })}));
+// // The second vector has different partition key value.
+// vectors.push_back(makeRowVector(
+// rowType->names(),
+// {makeFlatVector(1'000, [&](auto row) { return row * 234 % 30;
+// }),
+// makeFlatVector(1'000, [&](auto row) { return row + 1; })}));
+// createDuckDbTable(vectors);
+//
+// auto outputDirectory = TempDirectoryPath::create();
+// auto plan = createInsertPlan(
+// PlanBuilder().values(vectors),
+// rowType,
+// outputDirectory->getPath(),
+// partitionKeys,
+// nullptr,
+// compressionKind_,
+// numTableWriterCount_);
+//
+// assertQueryWithWriterConfigs(plan, "SELECT count(*) FROM tmp");
+//
+// auto newOutputType = getNonPartitionsColumns(partitionKeys, rowType);
+// assertQuery(
+// PlanBuilder().tableScan(newOutputType).planNode(),
+// makeHiveConnectorSplits(outputDirectory),
+// "SELECT c1 FROM tmp");
+// }
+//
+// TEST_P(PartitionedTableWriterTest, maxPartitions) {
+// SCOPED_TRACE(testParam_.toString());
+// const int32_t maxPartitions = 100;
+// const int32_t numPartitions =
+// testMode_ == TestMode::kBucketed ? 1 : maxPartitions + 1;
+// if (testMode_ == TestMode::kBucketed) {
+// setBucketProperty(
+// testParam_.bucketKind(),
+// 1000,
+// bucketProperty_->bucketedBy(),
+// bucketProperty_->bucketedTypes(),
+// bucketProperty_->sortedBy());
+// }
+//
+// auto rowType = ROW({"p0", "c3", "c5"}, {BIGINT(), REAL(), VARCHAR()});
+// std::vector partitionKeys = {"p0"};
+//
+// RowVectorPtr vector;
+// if (testMode_ == TestMode::kPartitioned) {
+// vector = makeRowVector(
+// rowType->names(),
+// {makeFlatVector(numPartitions, [&](auto row) { return row;
+// }),
+// makeFlatVector(
+// numPartitions, [&](auto row) { return row + 33.23; }),
+// makeFlatVector(numPartitions, [&](auto row) {
+// return StringView::makeInline(fmt::format("bucket_{}", row * 3));
+// })});
+// } else {
+// vector = makeRowVector(
+// rowType->names(),
+// {makeFlatVector(4'000, [&](auto /*unused*/) { return 0; }),
+// makeFlatVector(4'000, [&](auto row) { return row + 33.23; }),
+// makeFlatVector(4'000, [&](auto row) {
+// return StringView::makeInline(fmt::format("bucket_{}", row * 3));
+// })});
+// };
+//
+// auto outputDirectory = TempDirectoryPath::create();
+// auto plan = createInsertPlan(
+// PlanBuilder().values({vector}),
+// rowType,
+// outputDirectory->getPath(),
+// partitionKeys,
+// bucketProperty_,
+// compressionKind_,
+// getNumWriters(),
+// connector::common::LocationHandle::TableType::kNew,
+// commitStrategy_);
+//
+// if (testMode_ == TestMode::kPartitioned) {
+// VELOX_ASSERT_THROW(
+// AssertQueryBuilder(plan)
+// .connectorSessionProperty(
+// kHiveConnectorId,
+// HiveConfig::kMaxPartitionsPerWritersSession,
+// folly::to(maxPartitions))
+// .copyResults(pool()),
+// fmt::format(
+// "Exceeded limit of {} distinct partitions.", maxPartitions));
+// } else {
+// VELOX_ASSERT_THROW(
+// AssertQueryBuilder(plan)
+// .connectorSessionProperty(
+// kHiveConnectorId,
+// HiveConfig::kMaxPartitionsPerWritersSession,
+// folly::to(maxPartitions))
+// .copyResults(pool()),
+// "Exceeded open writer limit");
+// }
+// }
+//
+//// Test TableWriter does not create a file if input is empty.
+// TEST_P(AllTableWriterTest, writeNoFile) {
+// auto outputDirectory = TempDirectoryPath::create();
+// auto plan = createInsertPlan(
+// PlanBuilder().tableScan(rowType_).filter("false"),
+// rowType_,
+// outputDirectory->getPath());
+//
+// auto execute = [&](const std::shared_ptr& plan,
+// std::shared_ptr queryCtx) {
+// CursorParameters params;
+// params.planNode = plan;
+// params.queryCtx = queryCtx;
+// readCursor(params, [&](TaskCursor* taskCursor) {
+// if (taskCursor->noMoreSplits()) {
+// return;
+// }
+// taskCursor->task()->noMoreSplits("0");
+// taskCursor->setNoMoreSplits();
+// });
+// };
+//
+// execute(plan, core::QueryCtx::create(executor_.get()));
+// ASSERT_TRUE(fs::is_empty(outputDirectory->getPath()));
+// }
+//
+// TEST_P(UnpartitionedTableWriterTest, differentCompression) {
+// std::vector compressions{
+// CompressionKind_NONE,
+// CompressionKind_ZLIB,
+// CompressionKind_SNAPPY,
+// CompressionKind_LZO,
+// CompressionKind_ZSTD,
+// CompressionKind_LZ4,
+// CompressionKind_GZIP,
+// CompressionKind_MAX};
+//
+// for (auto compressionKind : compressions) {
+// auto input = makeVectors(10, 10);
+// auto outputDirectory = TempDirectoryPath::create();
+// if (compressionKind == CompressionKind_MAX) {
+// VELOX_ASSERT_THROW(
+// createInsertPlan(
+// PlanBuilder().values(input),
+// rowType_,
+// outputDirectory->getPath(),
+// {},
+// nullptr,
+// compressionKind,
+// numTableWriterCount_,
+// connector::common::LocationHandle::TableType::kNew),
+// "Unsupported compression type: CompressionKind_MAX");
+// return;
+// }
+// auto plan = createInsertPlan(
+// PlanBuilder().values(input),
+// rowType_,
+// outputDirectory->getPath(),
+// {},
+// nullptr,
+// compressionKind,
+// numTableWriterCount_,
+// connector::common::LocationHandle::TableType::kNew);
+//
+// // currently we don't support any compression in PARQUET format
+// if (fileFormat_ == FileFormat::PARQUET &&
+// compressionKind != CompressionKind_NONE) {
+// continue;
+// }
+// if (compressionKind == CompressionKind_NONE ||
+// compressionKind == CompressionKind_ZLIB ||
+// compressionKind == CompressionKind_ZSTD) {
+// auto result = AssertQueryBuilder(plan)
+// .config(
+// QueryConfig::kTaskWriterCount,
+// std::to_string(numTableWriterCount_))
+// .copyResults(pool());
+// assertEqualResults(
+// {makeRowVector({makeConstant(100, 1)})}, {result});
+// } else {
+// VELOX_ASSERT_THROW(
+// AssertQueryBuilder(plan)
+// .config(
+// QueryConfig::kTaskWriterCount,
+// std::to_string(numTableWriterCount_))
+// .copyResults(pool()),
+// "Unsupported compression type:");
+// }
+// }
+// }
+//
+// TEST_P(UnpartitionedTableWriterTest, runtimeStatsCheck) {
+// // The runtime stats test only applies for dwrf file format.
+// if (fileFormat_ != dwio::common::FileFormat::DWRF) {
+// return;
+// }
+// struct {
+// int numInputVectors;
+// std::string maxStripeSize;
+// int expectedNumStripes;
+//
+// std::string debugString() const {
+// return fmt::format(
+// "numInputVectors: {}, maxStripeSize: {}, expectedNumStripes: {}",
+// numInputVectors,
+// maxStripeSize,
+// expectedNumStripes);
+// }
+// } testSettings[] = {
+// {10, "1GB", 1},
+// {1, "1GB", 1},
+// {2, "1GB", 1},
+// {10, "1B", 10},
+// {2, "1B", 2},
+// {1, "1B", 1}};
+//
+// for (const auto& testData : testSettings) {
+// SCOPED_TRACE(testData.debugString());
+// auto rowType = ROW({"c0", "c1"}, {VARCHAR(), BIGINT()});
+//
+// VectorFuzzer::Options options;
+// options.nullRatio = 0.0;
+// options.vectorSize = 1;
+// options.stringLength = 1L << 20;
+// VectorFuzzer fuzzer(options, pool());
+//
+// std::vector vectors;
+// for (int i = 0; i < testData.numInputVectors; ++i) {
+// vectors.push_back(fuzzer.fuzzInputRow(rowType));
+// }
+//
+// createDuckDbTable(vectors);
+//
+// auto outputDirectory = TempDirectoryPath::create();
+// auto plan = createInsertPlan(
+// PlanBuilder().values(vectors),
+// rowType,
+// outputDirectory->getPath(),
+// {},
+// nullptr,
+// compressionKind_,
+// 1,
+// connector::common::LocationHandle::TableType::kNew);
+// const std::shared_ptr task =
+// AssertQueryBuilder(plan, duckDbQueryRunner_)
+// .config(QueryConfig::kTaskWriterCount, std::to_string(1))
+// .connectorSessionProperty(
+// kHiveConnectorId,
+// dwrf::Config::kOrcWriterMaxStripeSizeSession,
+// testData.maxStripeSize)
+// .assertResults("SELECT count(*) FROM tmp");
+// auto stats = task->taskStats().pipelineStats.front().operatorStats;
+// if (testData.maxStripeSize == "1GB") {
+// ASSERT_GT(
+// stats[1].memoryStats.peakTotalMemoryReservation,
+// testData.numInputVectors * options.stringLength);
+// }
+// ASSERT_EQ(
+// stats[1].runtimeStats["stripeSize"].count,
+// testData.expectedNumStripes);
+// ASSERT_EQ(stats[1].runtimeStats[TableWriter::kNumWrittenFiles].sum, 1);
+// ASSERT_EQ(stats[1].runtimeStats[TableWriter::kNumWrittenFiles].count, 1);
+// ASSERT_GE(stats[1].runtimeStats[TableWriter::kWriteIOTime].sum, 0);
+// ASSERT_EQ(stats[1].runtimeStats[TableWriter::kWriteIOTime].count, 1);
+// }
+// }
+//
+// TEST_P(UnpartitionedTableWriterTest, immutableSettings) {
+// struct {
+// connector::common::LocationHandle::TableType dataType;
+// bool immutablePartitionsEnabled;
+// bool expectedInsertSuccees;
+//
+// std::string debugString() const {
+// return fmt::format(
+// "dataType:{}, immutablePartitionsEnabled:{}, operationSuccess:{}",
+// dataType,
+// immutablePartitionsEnabled,
+// expectedInsertSuccees);
+// }
+// } testSettings[] = {
+// {connector::common::LocationHandle::TableType::kNew, true, true},
+// {connector::common::LocationHandle::TableType::kNew, false, true},
+// {connector::common::LocationHandle::TableType::kExisting, true, false},
+// {connector::common::LocationHandle::TableType::kExisting, false, true}};
+//
+// for (auto testData : testSettings) {
+// SCOPED_TRACE(testData.debugString());
+// std::unordered_map propFromFile{
+// {"hive.immutable-partitions",
+// testData.immutablePartitionsEnabled ? "true" : "false"}};
+// std::shared_ptr config{
+// std::make_shared(std::move(propFromFile))};
+// resetHiveConnector(config);
+//
+// auto input = makeVectors(10, 10);
+// auto outputDirectory = TempDirectoryPath::create();
+// auto plan = createInsertPlan(
+// PlanBuilder().values(input),
+// rowType_,
+// outputDirectory->getPath(),
+// {},
+// nullptr,
+// CompressionKind_NONE,
+// numTableWriterCount_,
+// testData.dataType);
+//
+// if (!testData.expectedInsertSuccees) {
+// VELOX_ASSERT_THROW(
+// AssertQueryBuilder(plan).copyResults(pool()),
+// "Unpartitioned Hive tables are immutable.");
+// } else {
+// auto result = AssertQueryBuilder(plan)
+// .config(
+// QueryConfig::kTaskWriterCount,
+// std::to_string(numTableWriterCount_))
+// .copyResults(pool());
+// assertEqualResults(
+// {makeRowVector({makeConstant(100, 1)})}, {result});
+// }
+// }
+// }
+//
+// TEST_P(BucketedTableOnlyWriteTest, bucketCountLimit) {
+// SCOPED_TRACE(testParam_.toString());
+// auto input = makeVectors(1, 100);
+// createDuckDbTable(input);
+// struct {
+// uint32_t bucketCount;
+// bool expectedError;
+//
+// std::string debugString() const {
+// return fmt::format(
+// "bucketCount:{} expectedError:{}", bucketCount, expectedError);
+// }
+// } testSettings[] = {
+// {1, false},
+// {3, false},
+// {HiveDataSink::maxBucketCount() - 1, false},
+// {HiveDataSink::maxBucketCount(), true},
+// {HiveDataSink::maxBucketCount() + 1, true},
+// {HiveDataSink::maxBucketCount() * 2, true}};
+// for (const auto& testData : testSettings) {
+// SCOPED_TRACE(testData.debugString());
+// auto outputDirectory = TempDirectoryPath::create();
+// setBucketProperty(
+// bucketProperty_->kind(),
+// testData.bucketCount,
+// bucketProperty_->bucketedBy(),
+// bucketProperty_->bucketedTypes(),
+// bucketProperty_->sortedBy());
+// auto plan = createInsertPlan(
+// PlanBuilder().values({input}),
+// rowType_,
+// outputDirectory->getPath(),
+// partitionedBy_,
+// bucketProperty_,
+// compressionKind_,
+// getNumWriters(),
+// connector::common::LocationHandle::TableType::kNew,
+// commitStrategy_);
+// if (testData.expectedError) {
+// VELOX_ASSERT_THROW(
+// AssertQueryBuilder(plan)
+// .connectorSessionProperty(
+// kHiveConnectorId,
+// HiveConfig::kMaxPartitionsPerWritersSession,
+// // Make sure we have a sufficient large writer limit.
+// folly::to(testData.bucketCount * 2))
+// .copyResults(pool()),
+// "bucketCount exceeds the limit");
+// } else {
+// assertQueryWithWriterConfigs(plan, "SELECT count(*) FROM tmp");
+//
+// if (partitionedBy_.size() > 0) {
+// auto newOutputType =
+// getNonPartitionsColumns(partitionedBy_, tableSchema_);
+// assertQuery(
+// PlanBuilder().tableScan(newOutputType).planNode(),
+// makeHiveConnectorSplits(outputDirectory),
+// "SELECT c2, c3, c4, c5 FROM tmp");
+// auto originalRowType = rowType_;
+// rowType_ = newOutputType;
+// verifyTableWriterOutput(outputDirectory->getPath(), rowType_);
+// rowType_ = originalRowType;
+// } else {
+// assertQuery(
+// PlanBuilder().tableScan(rowType_).planNode(),
+// makeHiveConnectorSplits(outputDirectory),
+// "SELECT * FROM tmp");
+// verifyTableWriterOutput(outputDirectory->getPath(), rowType_);
+// }
+// }
+// }
+// }
+//
+// TEST_P(BucketedTableOnlyWriteTest, mismatchedBucketTypes) {
+// SCOPED_TRACE(testParam_.toString());
+// auto input = makeVectors(1, 100);
+// createDuckDbTable(input);
+// auto outputDirectory = TempDirectoryPath::create();
+// std::vector badBucketedBy = bucketProperty_->bucketedTypes();
+// const auto oldType = badBucketedBy[0];
+// badBucketedBy[0] = VARCHAR();
+// setBucketProperty(
+// bucketProperty_->kind(),
+// bucketProperty_->bucketCount(),
+// bucketProperty_->bucketedBy(),
+// badBucketedBy,
+// bucketProperty_->sortedBy());
+// auto plan = createInsertPlan(
+// PlanBuilder().values({input}),
+// rowType_,
+// outputDirectory->getPath(),
+// partitionedBy_,
+// bucketProperty_,
+// compressionKind_,
+// getNumWriters(),
+// connector::common::LocationHandle::TableType::kNew,
+// commitStrategy_);
+// VELOX_ASSERT_THROW(
+// AssertQueryBuilder(plan).copyResults(pool()),
+// fmt::format(
+// "Input column {} type {} doesn't match bucket type {}",
+// bucketProperty_->bucketedBy()[0],
+// oldType->toString(),
+// bucketProperty_->bucketedTypes()[0]));
+// }
+//
+// TEST_P(AllTableWriterTest, tableWriteOutputCheck) {
+// SCOPED_TRACE(testParam_.toString());
+// if (!testParam_.multiDrivers() ||
+// testParam_.testMode() != TestMode::kUnpartitioned) {
+// return;
+// }
+// auto input = makeVectors(10, 100);
+// createDuckDbTable(input);
+// auto outputDirectory = TempDirectoryPath::create();
+// auto plan = createInsertPlan(
+// PlanBuilder().values({input}),
+// rowType_,
+// outputDirectory->getPath(),
+// partitionedBy_,
+// bucketProperty_,
+// compressionKind_,
+// getNumWriters(),
+// connector::common::LocationHandle::TableType::kNew,
+// commitStrategy_,
+// false);
+//
+// auto result = runQueryWithWriterConfigs(plan);
+// auto writtenRowVector = result->childAt(TableWriteTraits::kRowCountChannel)
+// ->asFlatVector();
+// auto fragmentVector = result->childAt(TableWriteTraits::kFragmentChannel)
+// ->asFlatVector();
+// auto commitContextVector =
+// result->childAt(TableWriteTraits::kContextChannel)
+// ->asFlatVector();
+// const int64_t expectedRows = 10 * 100;
+// std::vector writeFiles;
+// int64_t numRows{0};
+// for (int i = 0; i < result->size(); ++i) {
+// if (testParam_.multiDrivers()) {
+// ASSERT_FALSE(commitContextVector->isNullAt(i));
+// if (!fragmentVector->isNullAt(i)) {
+// ASSERT_TRUE(writtenRowVector->isNullAt(i));
+// }
+// } else {
+// if (i == 0) {
+// ASSERT_TRUE(fragmentVector->isNullAt(i));
+// } else {
+// ASSERT_TRUE(writtenRowVector->isNullAt(i));
+// ASSERT_FALSE(fragmentVector->isNullAt(i));
+// }
+// ASSERT_FALSE(commitContextVector->isNullAt(i));
+// }
+// if (!fragmentVector->isNullAt(i)) {
+// ASSERT_FALSE(fragmentVector->isNullAt(i));
+// folly::dynamic obj = folly::parseJson(fragmentVector->valueAt(i));
+// if (testMode_ == TestMode::kUnpartitioned) {
+// ASSERT_EQ(obj["targetPath"], outputDirectory->getPath());
+// ASSERT_EQ(obj["writePath"], outputDirectory->getPath());
+// } else {
+// std::string partitionDirRe;
+// for (const auto& partitionBy : partitionedBy_) {
+// partitionDirRe += fmt::format("/{}=.+", partitionBy);
+// }
+// ASSERT_TRUE(RE2::FullMatch(
+// obj["targetPath"].asString(),
+// fmt::format("{}{}", outputDirectory->getPath(), partitionDirRe)))
+// << obj["targetPath"].asString();
+// ASSERT_TRUE(RE2::FullMatch(
+// obj["writePath"].asString(),
+// fmt::format("{}{}", outputDirectory->getPath(), partitionDirRe)))
+// << obj["writePath"].asString();
+// }
+// numRows += obj["rowCount"].asInt();
+// ASSERT_EQ(obj["updateMode"].asString(), "NEW");
+//
+// ASSERT_TRUE(obj["fileWriteInfos"].isArray());
+// ASSERT_EQ(obj["fileWriteInfos"].size(), 1);
+// folly::dynamic writerInfoObj = obj["fileWriteInfos"][0];
+// const std::string writeFileName =
+// writerInfoObj["writeFileName"].asString();
+// writeFiles.push_back(writeFileName);
+// const std::string targetFileName =
+// writerInfoObj["targetFileName"].asString();
+// const std::string writeFileFullPath =
+// obj["writePath"].asString() + "/" + writeFileName;
+// std::filesystem::path path{writeFileFullPath};
+// const auto actualFileSize = fs::file_size(path);
+// ASSERT_EQ(obj["onDiskDataSizeInBytes"].asInt(), actualFileSize);
+// ASSERT_GT(obj["inMemoryDataSizeInBytes"].asInt(), 0);
+// ASSERT_EQ(writerInfoObj["fileSize"], actualFileSize);
+// if (commitStrategy_ == connector::common::CommitStrategy::kNoCommit) {
+// ASSERT_EQ(writeFileName, targetFileName);
+// } else {
+// const std::string kParquetSuffix = ".parquet";
+// if (folly::StringPiece(targetFileName).endsWith(kParquetSuffix)) {
+// // Remove the .parquet suffix.
+// auto trimmedFilename = targetFileName.substr(
+// 0, targetFileName.size() - kParquetSuffix.size());
+// ASSERT_TRUE(writeFileName.find(trimmedFilename) !=
+// std::string::npos);
+// } else {
+// ASSERT_TRUE(writeFileName.find(targetFileName) !=
+// std::string::npos);
+// }
+// }
+// }
+// if (!commitContextVector->isNullAt(i)) {
+// ASSERT_TRUE(RE2::FullMatch(
+// commitContextVector->valueAt(i).getString(),
+// fmt::format(".*{}.*", commitStrategyToString(commitStrategy_))))
+// << commitContextVector->valueAt(i);
+// }
+// }
+// ASSERT_EQ(numRows, expectedRows);
+// if (testMode_ == TestMode::kUnpartitioned) {
+// ASSERT_GT(writeFiles.size(), 0);
+// ASSERT_LE(writeFiles.size(), numTableWriterCount_);
+// }
+// auto diskFiles = listAllFiles(outputDirectory->getPath());
+// std::sort(diskFiles.begin(), diskFiles.end());
+// std::sort(writeFiles.begin(), writeFiles.end());
+// ASSERT_EQ(diskFiles, writeFiles)
+// << "\nwrite files: " << folly::join(",", writeFiles)
+// << "\ndisk files: " << folly::join(",", diskFiles);
+// // Verify the utilities provided by table writer traits.
+// ASSERT_EQ(TableWriteTraits::getRowCount(result), 10 * 100);
+// auto obj = TableWriteTraits::getTableCommitContext(result);
+// ASSERT_EQ(
+// obj[TableWriteTraits::kCommitStrategyContextKey],
+// commitStrategyToString(commitStrategy_));
+// ASSERT_EQ(obj[TableWriteTraits::klastPageContextKey], true);
+// ASSERT_EQ(obj[TableWriteTraits::kLifeSpanContextKey], "TaskWide");
+// }
+//
+// TEST_P(AllTableWriterTest, columnStatsDataTypes) {
+// auto rowType =
+// ROW({"c0", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8"},
+// {BIGINT(),
+// INTEGER(),
+// SMALLINT(),
+// REAL(),
+// DOUBLE(),
+// VARCHAR(),
+// BOOLEAN(),
+// MAP(DATE(), BIGINT()),
+// ARRAY(BIGINT())});
+// setDataTypes(rowType);
+// std::vector input;
+// input.push_back(makeRowVector(
+// rowType_->names(),
+// {
+// makeFlatVector(1'000, [&](auto row) { return 1; }),
+// makeFlatVector(1'000, [&](auto row) { return 1; }),
+// makeFlatVector(1'000, [&](auto row) { return row; }),
+// makeFlatVector(1'000, [&](auto row) { return row + 33.23;
+// }), makeFlatVector(1'000, [&](auto row) { return row
+// + 33.23; }), makeFlatVector(
+// 1'000,
+// [&](auto row) {
+// return StringView(std::to_string(row).c_str());
+// }),
+// makeFlatVector(1'000, [&](auto row) { return true; }),
+// makeMapVector(
+// 1'000,
+// [](auto /*row*/) { return 5; },
+// [](auto row) { return row; },
+// [](auto row) { return row * 3; }),
+// makeArrayVector(
+// 1'000,
+// [](auto /*row*/) { return 5; },
+// [](auto row) { return row * 3; }),
+// }));
+// createDuckDbTable(input);
+// auto outputDirectory = TempDirectoryPath::create();
+//
+// std::vector groupingKeyFields;
+// for (int i = 0; i < partitionedBy_.size(); ++i) {
+// groupingKeyFields.emplace_back(std::make_shared(
+// partitionTypes_.at(i), partitionedBy_.at(i)));
+// }
+//
+// // aggregation node
+// core::TypedExprPtr intInputField =
+// std::make_shared(SMALLINT(), "c2");
+// auto minCallExpr = std::make_shared(
+// SMALLINT(), std::vector{intInputField}, "min");
+// auto maxCallExpr = std::make_shared(
+// SMALLINT(), std::vector{intInputField}, "max");
+// auto distinctCountCallExpr = std::make_shared(
+// VARCHAR(),
+// std::vector{intInputField},
+// "approx_distinct");
+//
+// core::TypedExprPtr strInputField =
+// std::make_shared(VARCHAR(), "c5");
+// auto maxDataSizeCallExpr = std::make_shared(
+// BIGINT(),
+// std::vector{strInputField},
+// "max_data_size_for_stats");
+// auto sumDataSizeCallExpr = std::make_shared(
+// BIGINT(),
+// std::vector{strInputField},
+// "sum_data_size_for_stats");
+//
+// core::TypedExprPtr boolInputField =
+// std::make_shared(BOOLEAN(), "c6");
+// auto countCallExpr = std::make_shared(
+// BIGINT(), std::vector{boolInputField}, "count");
+// auto countIfCallExpr = std::make_shared(
+// BIGINT(), std::vector{boolInputField}, "count_if");
+//
+// core::TypedExprPtr mapInputField =
+// std::make_shared(
+// MAP(DATE(), BIGINT()), "c7");
+// auto countMapCallExpr = std::make_shared(
+// BIGINT(), std::vector{mapInputField}, "count");
+// auto sumDataSizeMapCallExpr = std::make_shared(
+// BIGINT(),
+// std::vector{mapInputField},
+// "sum_data_size_for_stats");
+//
+// core::TypedExprPtr arrayInputField =
+// std::make_shared(
+// MAP(DATE(), BIGINT()), "c7");
+// auto countArrayCallExpr = std::make_shared(
+// BIGINT(), std::vector{mapInputField}, "count");
+// auto sumDataSizeArrayCallExpr = std::make_shared(
+// BIGINT(),
+// std::vector{mapInputField},
+// "sum_data_size_for_stats");
+//
+// const std::vector aggregateNames = {
+// "min",
+// "max",
+// "approx_distinct",
+// "max_data_size_for_stats",
+// "sum_data_size_for_stats",
+// "count",
+// "count_if",
+// "count",
+// "sum_data_size_for_stats",
+// "count",
+// "sum_data_size_for_stats",
+// };
+//
+// auto makeAggregate = [](const auto& callExpr) {
+// std::vector rawInputTypes;
+// for (const auto& input : callExpr->inputs()) {
+// rawInputTypes.push_back(input->type());
+// }
+// return core::AggregationNode::Aggregate{
+// callExpr,
+// rawInputTypes,
+// nullptr, // mask
+// {}, // sortingKeys
+// {} // sortingOrders
+// };
+// };
+//
+// std::vector aggregates = {
+// makeAggregate(minCallExpr),
+// makeAggregate(maxCallExpr),
+// makeAggregate(distinctCountCallExpr),
+// makeAggregate(maxDataSizeCallExpr),
+// makeAggregate(sumDataSizeCallExpr),
+// makeAggregate(countCallExpr),
+// makeAggregate(countIfCallExpr),
+// makeAggregate(countMapCallExpr),
+// makeAggregate(sumDataSizeMapCallExpr),
+// makeAggregate(countArrayCallExpr),
+// makeAggregate(sumDataSizeArrayCallExpr),
+// };
+// const auto aggregationNode = std::make_shared(
+// core::PlanNodeId(),
+// core::AggregationNode::Step::kPartial,
+// groupingKeyFields,
+// std::vector{},
+// aggregateNames,
+// aggregates,
+// false, // ignoreNullKeys
+// PlanBuilder().values({input}).planNode());
+//
+// auto plan = PlanBuilder()
+// .values({input})
+// .addNode(addTableWriter(
+// rowType_,
+// rowType_->names(),
+// aggregationNode,
+// std::make_shared(
+// kHiveConnectorId,
+// makeHiveInsertTableHandle(
+// rowType_->names(),
+// rowType_->children(),
+// partitionedBy_,
+// nullptr,
+// makeLocationHandle(outputDirectory->getPath()))),
+// false,
+// connector::common::CommitStrategy::kNoCommit))
+// .planNode();
+//
+// // the result is in format of : row/fragments/context/[partition]/[stats]
+// int nextColumnStatsIndex = 3 + partitionedBy_.size();
+// const RowVectorPtr result = AssertQueryBuilder(plan).copyResults(pool());
+// auto minStatsVector =
+// result->childAt(nextColumnStatsIndex++)->asFlatVector();
+// ASSERT_EQ(minStatsVector->valueAt(0), 0);
+// const auto maxStatsVector =
+// result->childAt(nextColumnStatsIndex++)->asFlatVector();
+// ASSERT_EQ(maxStatsVector->valueAt(0), 999);
+// const auto distinctCountStatsVector =
+// result->childAt(nextColumnStatsIndex++)->asFlatVector();
+// HashStringAllocator allocator{pool_.get()};
+// DenseHll denseHll{
+// std::string(distinctCountStatsVector->valueAt(0)).c_str(), &allocator};
+// ASSERT_EQ(denseHll.cardinality(), 1000);
+// const auto maxDataSizeStatsVector =
+// result->childAt(nextColumnStatsIndex++)->asFlatVector();
+// ASSERT_EQ(maxDataSizeStatsVector->valueAt(0), 7);
+// const auto sumDataSizeStatsVector =
+// result->childAt(nextColumnStatsIndex++)->asFlatVector();
+// ASSERT_EQ(sumDataSizeStatsVector->valueAt(0), 6890);
+// const auto countStatsVector =
+// result->childAt(nextColumnStatsIndex++)->asFlatVector();
+// ASSERT_EQ(countStatsVector->valueAt(0), 1000);
+// const auto countIfStatsVector =
+// result->childAt(nextColumnStatsIndex++)->asFlatVector();
+// ASSERT_EQ(countIfStatsVector->valueAt(0), 1000);
+// const auto countMapStatsVector =
+// result->childAt(nextColumnStatsIndex++)->asFlatVector();
+// ASSERT_EQ(countMapStatsVector->valueAt(0), 1000);
+// const auto sumDataSizeMapStatsVector =
+// result->childAt(nextColumnStatsIndex++)->asFlatVector();
+// ASSERT_EQ(sumDataSizeMapStatsVector->valueAt(0), 64000);
+// const auto countArrayStatsVector =
+// result->childAt(nextColumnStatsIndex++)->asFlatVector();
+// ASSERT_EQ(countArrayStatsVector->valueAt(0), 1000);
+// const auto sumDataSizeArrayStatsVector =
+// result->childAt(nextColumnStatsIndex++)->asFlatVector();
+// ASSERT_EQ(sumDataSizeArrayStatsVector->valueAt(0), 64000);
+// }
+//
+// TEST_P(AllTableWriterTest, columnStats) {
+// auto input = makeVectors(1, 100);
+// createDuckDbTable(input);
+// auto outputDirectory = TempDirectoryPath::create();
+//
+// // 1. standard columns
+// std::vector output = {
+// "numWrittenRows", "fragment", "tableCommitContext"};
+// std::vector types = {BIGINT(), VARBINARY(), VARBINARY()};
+// std::vector groupingKeys;
+// // 2. partition columns
+// for (int i = 0; i < partitionedBy_.size(); i++) {
+// groupingKeys.emplace_back(
+// std::make_shared(
+// partitionTypes_.at(i), partitionedBy_.at(i)));
+// output.emplace_back(partitionedBy_.at(i));
+// types.emplace_back(partitionTypes_.at(i));
+// }
+// // 3. stats columns
+// output.emplace_back("min");
+// types.emplace_back(BIGINT());
+// const auto writerOutputType = ROW(std::move(output), std::move(types));
+//
+// // aggregation node
+// auto aggregationNode = generateAggregationNode(
+// "c0",
+// groupingKeys,
+// core::AggregationNode::Step::kPartial,
+// PlanBuilder().values({input}).planNode());
+//
+// auto plan = PlanBuilder()
+// .values({input})
+// .addNode(addTableWriter(
+// rowType_,
+// rowType_->names(),
+// aggregationNode,
+// std::make_shared(
+// kHiveConnectorId,
+// makeHiveInsertTableHandle(
+// rowType_->names(),
+// rowType_->children(),
+// partitionedBy_,
+// bucketProperty_,
+// makeLocationHandle(outputDirectory->getPath()))),
+// false,
+// commitStrategy_))
+// .planNode();
+//
+// auto result = AssertQueryBuilder(plan).copyResults(pool());
+// auto rowVector = result->childAt(0)->asFlatVector();
+// auto fragmentVector = result->childAt(1)->asFlatVector();
+// auto columnStatsVector =
+// result->childAt(3 + partitionedBy_.size())->asFlatVector();
+//
+// std::vector writeFiles;
+//
+// // For partitioned, expected result is as follows:
+// // Row Fragment Context partition c1_min_value
+// // null null x partition1 0
+// // null null x partition2 10
+// // null null x partition3 15
+// // count null x null null
+// // null partition1_update x null null
+// // null partition1_update x null null
+// // null partition2_update x null null
+// // null partition2_update x null null
+// // null partition3_update x null null
+// //
+// // Note that we can have multiple same partition_update, they're for
+// // different files, but for stats, we would only have one record for each
+// // partition
+// //
+// // For unpartitioned, expected result is:
+// // Row Fragment Context partition c1_min_value
+// // null null x 0
+// // count null x null null
+// // null update x null null
+//
+// int countRow = 0;
+// while (!columnStatsVector->isNullAt(countRow)) {
+// countRow++;
+// }
+// for (int i = 0; i < result->size(); ++i) {
+// if (i < countRow) {
+// ASSERT_FALSE(columnStatsVector->isNullAt(i));
+// ASSERT_TRUE(rowVector->isNullAt(i));
+// ASSERT_TRUE(fragmentVector->isNullAt(i));
+// } else if (i == countRow) {
+// ASSERT_TRUE(columnStatsVector->isNullAt(i));
+// ASSERT_FALSE(rowVector->isNullAt(i));
+// ASSERT_TRUE(fragmentVector->isNullAt(i));
+// } else {
+// ASSERT_TRUE(columnStatsVector->isNullAt(i));
+// ASSERT_TRUE(rowVector->isNullAt(i));
+// ASSERT_FALSE(fragmentVector->isNullAt(i));
+// }
+// }
+// }
+//
+// TEST_P(AllTableWriterTest, columnStatsWithTableWriteMerge) {
+// auto input = makeVectors(1, 100);
+// createDuckDbTable(input);
+// auto outputDirectory = TempDirectoryPath::create();
+//
+// // 1. standard columns
+// std::vector output = {
+// "numWrittenRows", "fragment", "tableCommitContext"};
+// std::vector types = {BIGINT(), VARBINARY(), VARBINARY()};
+// std::vector groupingKeys;
+// // 2. partition columns
+// for (int i = 0; i < partitionedBy_.size(); i++) {
+// groupingKeys.emplace_back(
+// std::make_shared(
+// partitionTypes_.at(i), partitionedBy_.at(i)));
+// output.emplace_back(partitionedBy_.at(i));
+// types.emplace_back(partitionTypes_.at(i));
+// }
+// // 3. stats columns
+// output.emplace_back("min");
+// types.emplace_back(BIGINT());
+// const auto writerOutputType = ROW(std::move(output), std::move(types));
+//
+// // aggregation node
+// auto aggregationNode = generateAggregationNode(
+// "c0",
+// groupingKeys,
+// core::AggregationNode::Step::kPartial,
+// PlanBuilder().values({input}).planNode());
+//
+// auto tableWriterPlan =
+// PlanBuilder().values({input}).addNode(addTableWriter(
+// rowType_,
+// rowType_->names(),
+// aggregationNode,
+// std::make_shared(
+// kHiveConnectorId,
+// makeHiveInsertTableHandle(
+// rowType_->names(),
+// rowType_->children(),
+// partitionedBy_,
+// bucketProperty_,
+// makeLocationHandle(outputDirectory->getPath()))),
+// false,
+// commitStrategy_));
+//
+// auto mergeAggregationNode = generateAggregationNode(
+// "min",
+// groupingKeys,
+// core::AggregationNode::Step::kIntermediate,
+// std::move(tableWriterPlan.planNode()));
+//
+// auto finalPlan = tableWriterPlan.capturePlanNodeId(tableWriteNodeId_)
+// .localPartition(std::vector{})
+// .tableWriteMerge(std::move(mergeAggregationNode))
+// .planNode();
+//
+// auto result = AssertQueryBuilder(finalPlan).copyResults(pool());
+// auto rowVector = result->childAt(0)->asFlatVector();
+// auto fragmentVector = result->childAt(1)->asFlatVector();
+// auto columnStatsVector =
+// result->childAt(3 + partitionedBy_.size())->asFlatVector();
+//
+// std::vector writeFiles;
+//
+// // For partitioned, expected result is as follows:
+// // Row Fragment Context partition c1_min_value
+// // null null x partition1 0
+// // null null x partition2 10
+// // null null x partition3 15
+// // count null x null null
+// // null partition1_update x null null
+// // null partition1_update x null null
+// // null partition2_update x null null
+// // null partition2_update x null null
+// // null partition3_update x null null
+// //
+// // Note that we can have multiple same partition_update, they're for
+// // different files, but for stats, we would only have one record for each
+// // partition
+// //
+// // For unpartitioned, expected result is:
+// // Row Fragment Context partition c1_min_value
+// // null null x 0
+// // count null x null null
+// // null update x null null
+//
+// int statsRow = 0;
+// while (columnStatsVector->isNullAt(statsRow) && statsRow < result->size())
+// {
+// ++statsRow;
+// }
+// for (int i = 1; i < result->size(); ++i) {
+// if (i < statsRow) {
+// ASSERT_TRUE(rowVector->isNullAt(i));
+// ASSERT_FALSE(fragmentVector->isNullAt(i));
+// ASSERT_TRUE(columnStatsVector->isNullAt(i));
+// } else if (i < result->size() - 1) {
+// ASSERT_TRUE(rowVector->isNullAt(i));
+// ASSERT_TRUE(fragmentVector->isNullAt(i));
+// ASSERT_FALSE(columnStatsVector->isNullAt(i));
+// } else {
+// ASSERT_FALSE(rowVector->isNullAt(i));
+// ASSERT_TRUE(fragmentVector->isNullAt(i));
+// ASSERT_TRUE(columnStatsVector->isNullAt(i));
+// }
+// }
+// }
+//
+//// TODO: add partitioned table write update mode tests and more failure tests.
+//
+// TEST_P(AllTableWriterTest, tableWriterStats) {
+// const int32_t numBatches = 2;
+// auto rowType =
+// ROW({"c0", "p0", "c3", "c5"}, {VARCHAR(), BIGINT(), REAL(), VARCHAR()});
+// std::vector partitionKeys = {"p0"};
+//
+// VectorFuzzer::Options options;
+// options.vectorSize = 1000;
+// VectorFuzzer fuzzer(options, pool());
+// // Partition vector is constant vector.
+// std::vector vectors = makeBatches(numBatches, [&](auto) {
+// return makeRowVector(
+// rowType->names(),
+// {fuzzer.fuzzFlat(VARCHAR()),
+// fuzzer.fuzzConstant(BIGINT()),
+// fuzzer.fuzzFlat(REAL()),
+// fuzzer.fuzzFlat(VARCHAR())});
+// });
+// createDuckDbTable(vectors);
+//
+// auto inputFilePaths = makeFilePaths(numBatches);
+// for (int i = 0; i < numBatches; i++) {
+// writeToFile(inputFilePaths[i]->getPath(), vectors[i]);
+// }
+//
+// auto outputDirectory = TempDirectoryPath::create();
+// const int numWriters = getNumWriters();
+// auto plan = createInsertPlan(
+// PlanBuilder().tableScan(rowType),
+// rowType,
+// outputDirectory->getPath(),
+// partitionKeys,
+// bucketProperty_,
+// compressionKind_,
+// numWriters,
+// connector::common::LocationHandle::TableType::kNew,
+// commitStrategy_);
+//
+// auto task = assertQueryWithWriterConfigs(
+// plan, inputFilePaths, "SELECT count(*) FROM tmp");
+//
+// // Each batch would create a new partition, numWrittenFiles is same as
+// // partition num when not bucketed. When bucketed, it's partitionNum *
+// // bucketNum, bucket number is 4
+// const int numWrittenFiles =
+// bucketProperty_ == nullptr ? numBatches : numBatches * 4;
+// // The size of bytes (ORC_MAGIC_LEN) written when the DWRF writer
+// // initializes a file.
+// const int32_t ORC_HEADER_LEN{3};
+// const auto fixedWrittenBytes =
+// numWrittenFiles * (fileFormat_ == FileFormat::DWRF ? ORC_HEADER_LEN :
+// 0);
+//
+// auto planStats = toPlanStats(task->taskStats());
+// auto& stats = planStats.at(tableWriteNodeId_);
+// ASSERT_GT(stats.physicalWrittenBytes, fixedWrittenBytes);
+// ASSERT_GT(
+// stats.operatorStats.at("TableWrite")->physicalWrittenBytes,
+// fixedWrittenBytes);
+// ASSERT_EQ(
+// stats.operatorStats.at("TableWrite")
+// ->customStats.at(TableWriter::kNumWrittenFiles)
+// .sum,
+// numWrittenFiles);
+// ASSERT_GE(
+// stats.operatorStats.at("TableWrite")
+// ->customStats.at(TableWriter::kWriteIOTime)
+// .sum,
+// 0);
+// ASSERT_GE(
+// stats.operatorStats.at("TableWrite")
+// ->customStats.at(TableWriter::kRunningWallNanos)
+// .sum,
+// 0);
+//}
+//
+// DEBUG_ONLY_TEST_P(
+// UnpartitionedTableWriterTest,
+// fileWriterFlushErrorOnDriverClose) {
+// VectorFuzzer::Options options;
+// const int batchSize = 1000;
+// options.vectorSize = batchSize;
+// VectorFuzzer fuzzer(options, pool());
+// const int numBatches = 10;
+// std::vector vectors;
+// int numRows{0};
+// for (int i = 0; i < numBatches; ++i) {
+// numRows += batchSize;
+// vectors.push_back(fuzzer.fuzzRow(rowType_));
+// }
+// std::atomic writeInputs{0};
+// std::atomic triggerWriterOOM{false};
+// SCOPED_TESTVALUE_SET(
+// "facebook::velox::exec::Driver::runInternal::addInput",
+// std::function([&](Operator* op) {
+// if (op->operatorType() != "TableWrite") {
+// return;
+// }
+// if (++writeInputs != 3) {
+// return;
+// }
+// op->operatorCtx()->task()->requestAbort();
+// triggerWriterOOM = true;
+// }));
+// SCOPED_TESTVALUE_SET(
+// "facebook::velox::memory::MemoryPoolImpl::reserveThreadSafe",
+// std::function([&](memory::MemoryPool* pool) {
+// const std::string dictPoolRe(".*dictionary");
+// const std::string generalPoolRe(".*general");
+// const std::string compressionPoolRe(".*compression");
+// if (!RE2::FullMatch(pool->name(), dictPoolRe) &&
+// !RE2::FullMatch(pool->name(), generalPoolRe) &&
+// !RE2::FullMatch(pool->name(), compressionPoolRe)) {
+// return;
+// }
+// if (!triggerWriterOOM) {
+// return;
+// }
+// VELOX_MEM_POOL_CAP_EXCEEDED("Inject write OOM");
+// }));
+//
+// auto outputDirectory = TempDirectoryPath::create();
+// auto op = createInsertPlan(
+// PlanBuilder().values(vectors),
+// rowType_,
+// outputDirectory->getPath(),
+// partitionedBy_,
+// bucketProperty_,
+// compressionKind_,
+// getNumWriters(),
+// connector::common::LocationHandle::TableType::kNew,
+// commitStrategy_);
+//
+// VELOX_ASSERT_THROW(
+// assertQuery(op, fmt::format("SELECT {}", numRows)),
+// "Aborted for external error");
+//}
+//
+// DEBUG_ONLY_TEST_P(UnpartitionedTableWriterTest, dataSinkAbortError) {
+// if (fileFormat_ != FileFormat::DWRF) {
+// // NOTE: only test on dwrf writer format as we inject write error in dwrf
+// // writer.
+// return;
+// }
+// VectorFuzzer::Options options;
+// const int batchSize = 100;
+// options.vectorSize = batchSize;
+// VectorFuzzer fuzzer(options, pool());
+// auto vector = fuzzer.fuzzInputRow(rowType_);
+//
+// std::atomic triggerWriterErrorOnce{true};
+// SCOPED_TESTVALUE_SET(
+// "facebook::velox::dwrf::Writer::write",
+// std::function([&](dwrf::Writer* /*unused*/) {
+// if (!triggerWriterErrorOnce.exchange(false)) {
+// return;
+// }
+// VELOX_FAIL("inject writer error");
+// }));
+//
+// std::atomic triggerAbortErrorOnce{true};
+// SCOPED_TESTVALUE_SET(
+// "facebook::velox::connector::hive::HiveDataSink::closeInternal",
+// std::function(
+// [&](const HiveDataSink* /*unused*/) {
+// if (!triggerAbortErrorOnce.exchange(false)) {
+// return;
+// }
+// VELOX_FAIL("inject abort error");
+// }));
+//
+// auto outputDirectory = TempDirectoryPath::create();
+// auto plan = PlanBuilder()
+// .values({vector})
+// .tableWrite(outputDirectory->getPath(), fileFormat_)
+// .planNode();
+// VELOX_ASSERT_THROW(
+// AssertQueryBuilder(plan).copyResults(pool()), "inject writer error");
+// ASSERT_FALSE(triggerWriterErrorOnce);
+// ASSERT_FALSE(triggerAbortErrorOnce);
+//}
+//
+// TEST_P(BucketSortOnlyTableWriterTest, sortWriterSpill) {
+// SCOPED_TRACE(testParam_.toString());
+//
+// const auto vectors = makeVectors(5, 500);
+// createDuckDbTable(vectors);
+//
+// auto outputDirectory = TempDirectoryPath::create();
+// auto op = createInsertPlan(
+// PlanBuilder().values(vectors),
+// rowType_,
+// outputDirectory->getPath(),
+// partitionedBy_,
+// bucketProperty_,
+// compressionKind_,
+// getNumWriters(),
+// connector::common::LocationHandle::TableType::kNew,
+// commitStrategy_);
+//
+// const auto spillStats = globalSpillStats();
+// auto task =
+// assertQueryWithWriterConfigs(op, fmt::format("SELECT {}", 5 * 500),
+// true);
+// if (partitionedBy_.size() > 0) {
+// rowType_ = getNonPartitionsColumns(partitionedBy_, rowType_);
+// verifyTableWriterOutput(outputDirectory->getPath(), rowType_);
+// } else {
+// verifyTableWriterOutput(outputDirectory->getPath(), rowType_);
+// }
+//
+// const auto updatedSpillStats = globalSpillStats();
+// ASSERT_GT(updatedSpillStats.spilledBytes, spillStats.spilledBytes);
+// ASSERT_GT(updatedSpillStats.spilledPartitions,
+// spillStats.spilledPartitions); auto taskStats =
+// toPlanStats(task->taskStats()); auto& stats =
+// taskStats.at(tableWriteNodeId_); ASSERT_GT(stats.spilledRows, 0);
+// ASSERT_GT(stats.spilledBytes, 0);
+// // One spilled partition per each written files.
+// const int numWrittenFiles = stats.customStats["numWrittenFiles"].sum;
+// ASSERT_GE(stats.spilledPartitions, numWrittenFiles);
+// ASSERT_GT(stats.customStats[Operator::kSpillRuns].sum, 0);
+// ASSERT_GT(stats.customStats[Operator::kSpillFillTime].sum, 0);
+// ASSERT_GT(stats.customStats[Operator::kSpillSortTime].sum, 0);
+// ASSERT_GT(stats.customStats[Operator::kSpillExtractVectorTime].sum, 0);
+// ASSERT_GT(stats.customStats[Operator::kSpillSerializationTime].sum, 0);
+// ASSERT_GT(stats.customStats[Operator::kSpillFlushTime].sum, 0);
+// ASSERT_GT(stats.customStats[Operator::kSpillWrites].sum, 0);
+// ASSERT_GT(stats.customStats[Operator::kSpillWriteTime].sum, 0);
+//}
+//
+// DEBUG_ONLY_TEST_P(BucketSortOnlyTableWriterTest, outputBatchRows) {
+// struct {
+// uint32_t maxOutputRows;
+// std::string maxOutputBytes;
+// int expectedOutputCount;
+//
+// // TODO: add output size check with spilling enabled
+// std::string debugString() const {
+// return fmt::format(
+// "maxOutputRows: {}, maxOutputBytes: {}, expectedOutputCount: {}",
+// maxOutputRows,
+// maxOutputBytes,
+// expectedOutputCount);
+// }
+// } testSettings[] = {// we have 4 buckets thus 4 writers.
+// {10000, "1000kB", 4},
+// // when maxOutputRows = 1, 1000 rows triggers 1000
+// writes {1, "1kB", 1000},
+// // estimatedRowSize is ~62bytes, when maxOutputSize = 62
+// *
+// // 100, 1000 rows triggers ~10 writes
+// {10000, "6200B", 12}};
+//
+// for (const auto& testData : testSettings) {
+// SCOPED_TRACE(testData.debugString());
+// std::atomic_int outputCount{0};
+// SCOPED_TESTVALUE_SET(
+// "facebook::velox::dwrf::Writer::write",
+// std::function(
+// [&](dwrf::Writer* /*unused*/) { ++outputCount; }));
+//
+// auto rowType =
+// ROW({"c0", "p0", "c1", "c3", "c4", "c5"},
+// {VARCHAR(), BIGINT(), INTEGER(), REAL(), DOUBLE(), VARCHAR()});
+// std::vector partitionKeys = {"p0"};
+//
+// // Partition vector is constant vector.
+// std::vector vectors = makeBatches(1, [&](auto) {
+// return makeRowVector(
+// rowType->names(),
+// {makeFlatVector(
+// 1'000,
+// [&](auto row) {
+// return StringView::makeInline(fmt::format("str_{}", row));
+// }),
+// makeConstant((int64_t)365, 1'000),
+// makeConstant((int32_t)365, 1'000),
+// makeFlatVector(1'000, [&](auto row) { return row + 33.23;
+// }), makeFlatVector(1'000, [&](auto row) { return row
+// + 33.23; }), makeFlatVector(1'000, [&](auto row) {
+// return StringView::makeInline(fmt::format("bucket_{}", row * 3));
+// })});
+// });
+// createDuckDbTable(vectors);
+//
+// auto outputDirectory = TempDirectoryPath::create();
+// auto plan = createInsertPlan(
+// PlanBuilder().values({vectors}),
+// rowType,
+// outputDirectory->getPath(),
+// partitionKeys,
+// bucketProperty_,
+// compressionKind_,
+// 1,
+// connector::common::LocationHandle::TableType::kNew,
+// commitStrategy_);
+// const std::shared_ptr task =
+// AssertQueryBuilder(plan, duckDbQueryRunner_)
+// .config(QueryConfig::kTaskWriterCount, std::to_string(1))
+// .connectorSessionProperty(
+// kHiveConnectorId,
+// HiveConfig::kSortWriterMaxOutputRowsSession,
+// folly::to(testData.maxOutputRows))
+// .connectorSessionProperty(
+// kHiveConnectorId,
+// HiveConfig::kSortWriterMaxOutputBytesSession,
+// folly::to(testData.maxOutputBytes))
+// .assertResults("SELECT count(*) FROM tmp");
+// auto stats = task->taskStats().pipelineStats.front().operatorStats;
+// ASSERT_EQ(outputCount, testData.expectedOutputCount);
+// }
+//}
+//
+// DEBUG_ONLY_TEST_P(BucketSortOnlyTableWriterTest, yield) {
+// auto rowType =
+// ROW({"c0", "p0", "c1", "c3", "c4", "c5"},
+// {VARCHAR(), BIGINT(), INTEGER(), REAL(), DOUBLE(), VARCHAR()});
+// std::vector partitionKeys = {"p0"};
+//
+// // Partition vector is constant vector.
+// std::vector vectors = makeBatches(1, [&](auto) {
+// return makeRowVector(
+// rowType->names(),
+// {makeFlatVector(
+// 1'000,
+// [&](auto row) {
+// return StringView::makeInline(fmt::format("str_{}", row));
+// }),
+// makeConstant((int64_t)365, 1'000),
+// makeConstant((int32_t)365, 1'000),
+// makeFlatVector(1'000, [&](auto row) { return row + 33.23; }),
+// makeFlatVector(1'000, [&](auto row) { return row + 33.23; }),
+// makeFlatVector(1'000, [&](auto row) {
+// return StringView::makeInline(fmt::format("bucket_{}", row * 3));
+// })});
+// });
+// createDuckDbTable(vectors);
+//
+// struct {
+// uint64_t flushTimeSliceLimitMs;
+// bool expectedYield;
+//
+// std::string debugString() const {
+// return fmt::format(
+// "flushTimeSliceLimitMs: {}, expectedYield: {}",
+// flushTimeSliceLimitMs,
+// expectedYield);
+// }
+// } testSettings[] = {{0, false}, {1, true}, {10'000, false}};
+//
+// for (const auto& testData : testSettings) {
+// SCOPED_TRACE(testData.debugString());
+// std::atomic_bool injectDelayOnce{true};
+// SCOPED_TESTVALUE_SET(
+// "facebook::velox::dwrf::Writer::write",
+// std::function([&](dwrf::Writer* /*unused*/) {
+// if (!injectDelayOnce.exchange(false)) {
+// return;
+// }
+// std::this_thread::sleep_for(std::chrono::seconds(2));
+// }));
+// createDuckDbTable(vectors);
+//
+// auto outputDirectory = TempDirectoryPath::create();
+// auto plan = createInsertPlan(
+// PlanBuilder().values({vectors}),
+// rowType,
+// outputDirectory->getPath(),
+// partitionKeys,
+// bucketProperty_,
+// compressionKind_,
+// 1,
+// connector::common::LocationHandle::TableType::kNew,
+// commitStrategy_);
+// const int prevYieldCount = Driver::yieldCount();
+// const std::shared_ptr task =
+// AssertQueryBuilder(plan, duckDbQueryRunner_)
+// .config(QueryConfig::kTaskWriterCount, std::to_string(1))
+// .connectorSessionProperty(
+// kHiveConnectorId,
+// HiveConfig::kSortWriterFinishTimeSliceLimitMsSession,
+// folly::to(testData.flushTimeSliceLimitMs))
+// .connectorSessionProperty(
+// kHiveConnectorId,
+// HiveConfig::kSortWriterMaxOutputRowsSession,
+// folly::to(100))
+// .connectorSessionProperty(
+// kHiveConnectorId,
+// HiveConfig::kSortWriterMaxOutputBytesSession,
+// folly::to("1KB"))
+// .assertResults("SELECT count(*) FROM tmp");
+// auto stats = task->taskStats().pipelineStats.front().operatorStats;
+// if (testData.expectedYield) {
+// ASSERT_GT(Driver::yieldCount(), prevYieldCount);
+// } else {
+// ASSERT_EQ(Driver::yieldCount(), prevYieldCount);
+// }
+// }
+//}
+//
+// VELOX_INSTANTIATE_TEST_SUITE_P(
+// TableWriterTest,
+// UnpartitionedTableWriterTest,
+// testing::ValuesIn(UnpartitionedTableWriterTest::getTestParams()));
+//
+// VELOX_INSTANTIATE_TEST_SUITE_P(
+// TableWriterTest,
+// PartitionedTableWriterTest,
+// testing::ValuesIn(PartitionedTableWriterTest::getTestParams()));
+//
+// VELOX_INSTANTIATE_TEST_SUITE_P(
+// TableWriterTest,
+// BucketedTableOnlyWriteTest,
+// testing::ValuesIn(BucketedTableOnlyWriteTest::getTestParams()));
+//
+// VELOX_INSTANTIATE_TEST_SUITE_P(
+// TableWriterTest,
+// AllTableWriterTest,
+// testing::ValuesIn(AllTableWriterTest::getTestParams()));
+//
+// VELOX_INSTANTIATE_TEST_SUITE_P(
+// TableWriterTest,
+// PartitionedWithoutBucketTableWriterTest,
+// testing::ValuesIn(
+// PartitionedWithoutBucketTableWriterTest::getTestParams()));
+//
+// VELOX_INSTANTIATE_TEST_SUITE_P(
+// TableWriterTest,
+// BucketSortOnlyTableWriterTest,
+// testing::ValuesIn(BucketSortOnlyTableWriterTest::getTestParams()));
+//
+// class TableWriterArbitrationTest : public HiveConnectorTestBase {
+// protected:
+// void SetUp() override {
+// HiveConnectorTestBase::SetUp();
+// filesystems::registerLocalFileSystem();
+// if (!isRegisteredVectorSerde()) {
+// this->registerVectorSerde();
+// }
+//
+// rowType_ = ROW(
+// {{"c0", INTEGER()},
+// {"c1", INTEGER()},
+// {"c2", VARCHAR()},
+// {"c3", VARCHAR()}});
+// fuzzerOpts_.vectorSize = 1024;
+// fuzzerOpts_.nullRatio = 0;
+// fuzzerOpts_.stringVariableLength = false;
+// fuzzerOpts_.stringLength = 1024;
+// fuzzerOpts_.allowLazyVector = false;
+// }
+//
+// folly::Random::DefaultGenerator rng_;
+// RowTypePtr rowType_;
+// VectorFuzzer::Options fuzzerOpts_;
+//};
+//
+// DEBUG_ONLY_TEST_F(TableWriterArbitrationTest, reclaimFromTableWriter) {
+// VectorFuzzer::Options options;
+// const int batchSize = 1'000;
+// options.vectorSize = batchSize;
+// options.stringVariableLength = false;
+// options.stringLength = 1'000;
+// VectorFuzzer fuzzer(options, pool());
+// const int numBatches = 20;
+// std::vector vectors;
+// int numRows{0};
+// for (int i = 0; i < numBatches; ++i) {
+// numRows += batchSize;
+// vectors.push_back(fuzzer.fuzzRow(rowType_));
+// }
+// createDuckDbTable(vectors);
+//
+// for (bool writerSpillEnabled : {false, true}) {
+// {
+// SCOPED_TRACE(fmt::format("writerSpillEnabled: {}", writerSpillEnabled));
+// auto queryPool = memory::memoryManager()->addRootPool(
+// "reclaimFromTableWriter", kQueryMemoryCapacity);
+// auto* arbitrator = memory::memoryManager()->arbitrator();
+// const int numPrevArbitrationFailures = arbitrator->stats().numFailures;
+// const int numPrevNonReclaimableAttempts =
+// arbitrator->stats().numNonReclaimableAttempts;
+// auto queryCtx = core::QueryCtx::create(
+// executor_.get(), QueryConfig{{}}, {}, nullptr,
+// std::move(queryPool));
+// ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity);
+//
+// std::atomic_int numInputs{0};
+// SCOPED_TESTVALUE_SET(
+// "facebook::velox::exec::Driver::runInternal::addInput",
+// std::function(([&](Operator* op) {
+// if (op->operatorType() != "TableWrite") {
+// return;
+// }
+// // We reclaim memory from table writer connector memory pool which
+// // connects to the memory pools inside the hive connector.
+// ASSERT_FALSE(op->canReclaim()); +// if (++numInputs != numBatches) { +// return; +// } +// +// const auto fakeAllocationSize = +// kQueryMemoryCapacity - op->pool()->parent()->reservedBytes(); +// if (writerSpillEnabled) { +// auto* buffer = op->pool()->allocate(fakeAllocationSize); +// op->pool()->free(buffer, fakeAllocationSize); +// } else { +// VELOX_ASSERT_THROW( +// op->pool()->allocate(fakeAllocationSize), +// "Exceeded memory pool"); +// } +// }))); +// +// auto spillDirectory = TempDirectoryPath::create(); +// auto outputDirectory = TempDirectoryPath::create(); +// core::PlanNodeId tableWriteNodeId; +// auto writerPlan = +// PlanBuilder() +// .values(vectors) +// .tableWrite(outputDirectory->getPath()) +// .capturePlanNodeId(tableWriteNodeId) +// .project({TableWriteTraits::rowCountColumnName()}) +// .singleAggregation( +// {}, +// {fmt::format( +// "sum({})", TableWriteTraits::rowCountColumnName())}) +// .planNode(); +// { +// auto task = +// AssertQueryBuilder(duckDbQueryRunner_) +// .queryCtx(queryCtx) +// .maxDrivers(1) +// .spillDirectory(spillDirectory->getPath()) +// .config(core::QueryConfig::kSpillEnabled, writerSpillEnabled) +// .config( +// core::QueryConfig::kWriterSpillEnabled, +// writerSpillEnabled) +// // Set 0 file writer flush threshold to always trigger flush +// // in test. +// .config(core::QueryConfig::kWriterFlushThresholdBytes, 0) +// .plan(std::move(writerPlan)) +// .assertResults(fmt::format("SELECT {}", numRows)); +// auto planStats = toPlanStats(task->taskStats()); +// auto& tableWriteStats = +// planStats.at(tableWriteNodeId).operatorStats.at("TableWrite"); +// if (writerSpillEnabled) { +// ASSERT_GT( +// tableWriteStats->customStats +// .at(HiveDataSink::kEarlyFlushedRawBytes) +// .count, +// 0); +// ASSERT_GT( +// tableWriteStats->customStats +// .at(HiveDataSink::kEarlyFlushedRawBytes) +// .sum, +// 0); +// ASSERT_EQ( +// arbitrator->stats().numFailures, numPrevArbitrationFailures); +// } else { +// ASSERT_EQ( +// tableWriteStats->customStats.count( +// HiveDataSink::kEarlyFlushedRawBytes), +// 0); +// ASSERT_EQ( +// arbitrator->stats().numFailures, numPrevArbitrationFailures + +// 1); +// } +// ASSERT_EQ( +// arbitrator->stats().numNonReclaimableAttempts, +// numPrevNonReclaimableAttempts); +// } +// waitForAllTasksToBeDeleted(3'000'000); +// } +// } +//} +// +// DEBUG_ONLY_TEST_F(TableWriterArbitrationTest, reclaimFromSortTableWriter) { +// VectorFuzzer::Options options; +// const int batchSize = 1'000; +// options.vectorSize = batchSize; +// options.stringVariableLength = false; +// options.stringLength = 1'000; +// VectorFuzzer fuzzer(options, pool()); +// const int numBatches = 20; +// std::vector vectors; +// int numRows{0}; +// const auto partitionKeyVector = makeFlatVector( +// batchSize, [&](vector_size_t /*unused*/) { return 0; }); +// for (int i = 0; i < numBatches; ++i) { +// numRows += batchSize; +// vectors.push_back(fuzzer.fuzzInputRow(rowType_)); +// vectors.back()->childAt(0) = partitionKeyVector; +// } +// createDuckDbTable(vectors); +// +// for (bool writerSpillEnabled : {false, true}) { +// { +// SCOPED_TRACE(fmt::format("writerSpillEnabled: {}", writerSpillEnabled)); +// auto queryPool = memory::memoryManager()->addRootPool( +// "reclaimFromSortTableWriter", kQueryMemoryCapacity); +// auto* arbitrator = memory::memoryManager()->arbitrator(); +// const int numPrevArbitrationFailures = arbitrator->stats().numFailures; +// const int numPrevNonReclaimableAttempts = +// arbitrator->stats().numNonReclaimableAttempts; +// auto 
queryCtx = core::QueryCtx::create( +// executor_.get(), QueryConfig{{}}, {}, nullptr, +// std::move(queryPool)); +// ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); +// +// const auto spillStats = velox::common::globalSpillStats(); +// std::atomic numInputs{0}; +// SCOPED_TESTVALUE_SET( +// "facebook::velox::exec::Driver::runInternal::addInput", +// std::function(([&](Operator* op) { +// if (op->operatorType() != "TableWrite") { +// return; +// } +// // We reclaim memory from table writer connector memory pool which +// // connects to the memory pools inside the hive connector. +// ASSERT_FALSE(op->canReclaim()); +// if (++numInputs != numBatches) { +// return; +// } +// +// const auto fakeAllocationSize = +// kQueryMemoryCapacity - op->pool()->parent()->reservedBytes(); +// if (writerSpillEnabled) { +// auto* buffer = op->pool()->allocate(fakeAllocationSize); +// op->pool()->free(buffer, fakeAllocationSize); +// } else { +// VELOX_ASSERT_THROW( +// op->pool()->allocate(fakeAllocationSize), +// "Exceeded memory pool"); +// } +// }))); +// +// auto spillDirectory = TempDirectoryPath::create(); +// auto outputDirectory = TempDirectoryPath::create(); +// auto writerPlan = +// PlanBuilder() +// .values(vectors) +// .tableWrite( +// outputDirectory->getPath(), +// {"c0"}, +// 4, +// {"c1"}, +// { +// std::make_shared( +// "c2", core::SortOrder{false, false}), +// }) +// .project({TableWriteTraits::rowCountColumnName()}) +// .singleAggregation( +// {}, +// {fmt::format( +// "sum({})", TableWriteTraits::rowCountColumnName())}) +// .planNode(); +// +// AssertQueryBuilder(duckDbQueryRunner_) +// .queryCtx(queryCtx) +// .maxDrivers(1) +// .spillDirectory(spillDirectory->getPath()) +// .config(core::QueryConfig::kSpillEnabled, writerSpillEnabled) +// .config(core::QueryConfig::kWriterSpillEnabled, writerSpillEnabled) +// // Set 0 file writer flush threshold to always trigger flush in +// // test. +// .config(core::QueryConfig::kWriterFlushThresholdBytes, 0) +// .plan(std::move(writerPlan)) +// .assertResults(fmt::format("SELECT {}", numRows)); +// +// ASSERT_EQ( +// arbitrator->stats().numFailures, +// numPrevArbitrationFailures + (writerSpillEnabled ? 
0 : 1)); +// ASSERT_EQ( +// arbitrator->stats().numNonReclaimableAttempts, +// numPrevNonReclaimableAttempts); +// +// waitForAllTasksToBeDeleted(3'000'000); +// const auto updatedSpillStats = velox::common::globalSpillStats(); +// if (writerSpillEnabled) { +// ASSERT_GT(updatedSpillStats.spilledBytes, spillStats.spilledBytes); +// ASSERT_GT( +// updatedSpillStats.spilledPartitions, +// spillStats.spilledPartitions); +// } else { +// ASSERT_EQ(updatedSpillStats, spillStats); +// } +// } +// } +//} +// +// DEBUG_ONLY_TEST_F(TableWriterArbitrationTest, writerFlushThreshold) { +// VectorFuzzer::Options options; +// const int batchSize = 1'000; +// options.vectorSize = batchSize; +// options.stringVariableLength = false; +// options.stringLength = 1'000; +// const int numBatches = 20; +// const int numRows = numBatches * batchSize; +// std::vector vectors = +// createVectors(numBatches, rowType_, options); +// createDuckDbTable(vectors); +// +// struct TestParam { +// uint64_t bytesToReserve{0}; +// uint64_t writerFlushThreshold{0}; +// }; +// const std::vector testParams{ +// {0, 0}, {0, 1UL << 30}, {64UL << 20, 1UL << 30}}; +// for (const auto& testParam : testParams) { +// SCOPED_TRACE(fmt::format( +// "bytesToReserve: {}, writerFlushThreshold: {}", +// succinctBytes(testParam.bytesToReserve), +// succinctBytes(testParam.writerFlushThreshold))); +// +// auto queryPool = memory::memoryManager()->addRootPool( +// "writerFlushThreshold", kQueryMemoryCapacity); +// auto* arbitrator = memory::memoryManager()->arbitrator(); +// const int numPrevArbitrationFailures = arbitrator->stats().numFailures; +// const int numPrevNonReclaimableAttempts = +// arbitrator->stats().numNonReclaimableAttempts; +// auto queryCtx = core::QueryCtx::create( +// executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); +// ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); +// +// memory::MemoryPool* compressionPool{nullptr}; +// SCOPED_TESTVALUE_SET( +// "facebook::velox::dwrf::Writer::write", +// std::function([&](dwrf::Writer* writer) { +// if (testParam.bytesToReserve == 0 || compressionPool != nullptr) { +// return; +// } +// compressionPool = &(writer->getContext().getMemoryPool( +// dwrf::MemoryUsageCategory::OUTPUT_STREAM)); +// })); +// +// std::atomic numInputs{0}; +// SCOPED_TESTVALUE_SET( +// "facebook::velox::exec::Driver::runInternal::addInput", +// std::function(([&](Operator* op) { +// if (op->operatorType() != "TableWrite") { +// return; +// } +// if (++numInputs != numBatches) { +// return; +// } +// +// if (testParam.bytesToReserve > 0) { +// ASSERT_TRUE(compressionPool != nullptr); +// compressionPool->maybeReserve(testParam.bytesToReserve); +// } +// +// const auto fakeAllocationSize = +// kQueryMemoryCapacity - op->pool()->parent()->usedBytes(); +// if (testParam.writerFlushThreshold == 0) { +// auto* buffer = op->pool()->allocate(fakeAllocationSize); +// op->pool()->free(buffer, fakeAllocationSize); +// } else { +// VELOX_ASSERT_THROW( +// op->pool()->allocate(fakeAllocationSize), +// "Exceeded memory pool"); +// } +// }))); +// +// auto spillDirectory = TempDirectoryPath::create(); +// auto outputDirectory = TempDirectoryPath::create(); +// auto writerPlan = +// PlanBuilder() +// .values(vectors) +// .tableWrite(outputDirectory->getPath()) +// .project({TableWriteTraits::rowCountColumnName()}) +// .singleAggregation( +// {}, +// {fmt::format( +// "sum({})", TableWriteTraits::rowCountColumnName())}) +// .planNode(); +// +// AssertQueryBuilder(duckDbQueryRunner_) +// 
.queryCtx(queryCtx) +// .maxDrivers(1) +// .spillDirectory(spillDirectory->getPath()) +// .config(core::QueryConfig::kSpillEnabled, true) +// .config(core::QueryConfig::kWriterSpillEnabled, true) +// .config( +// core::QueryConfig::kWriterFlushThresholdBytes, +// testParam.writerFlushThreshold) +// .plan(std::move(writerPlan)) +// .assertResults(fmt::format("SELECT {}", numRows)); +// +// ASSERT_EQ( +// arbitrator->stats().numFailures, +// numPrevArbitrationFailures + +// (testParam.writerFlushThreshold == 0 ? 0 : 1)); +// // We don't trigger reclaim on a writer if it doesn't meet the writer +// // flush threshold. +// ASSERT_EQ( +// arbitrator->stats().numNonReclaimableAttempts, +// numPrevNonReclaimableAttempts); +// ASSERT_GE(arbitrator->stats().reclaimedUsedBytes, +// testParam.bytesToReserve); waitForAllTasksToBeDeleted(3'000'000); +// queryCtx.reset(); +// } +//} +// +// DEBUG_ONLY_TEST_F( +// TableWriterArbitrationTest, +// reclaimFromNonReclaimableTableWriter) { +// VectorFuzzer::Options options; +// const int batchSize = 1'000; +// options.vectorSize = batchSize; +// options.stringVariableLength = false; +// options.stringLength = 1'000; +// VectorFuzzer fuzzer(options, pool()); +// const int numBatches = 20; +// std::vector vectors; +// int numRows{0}; +// for (int i = 0; i < numBatches; ++i) { +// numRows += batchSize; +// vectors.push_back(fuzzer.fuzzRow(rowType_)); +// } +// +// createDuckDbTable(vectors); +// +// auto queryPool = memory::memoryManager()->addRootPool( +// "reclaimFromNonReclaimableTableWriter", kQueryMemoryCapacity); +// auto* arbitrator = memory::memoryManager()->arbitrator(); +// const int numPrevArbitrationFailures = arbitrator->stats().numFailures; +// const int numPrevNonReclaimableAttempts = +// arbitrator->stats().numNonReclaimableAttempts; +// auto queryCtx = core::QueryCtx::create( +// executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); +// ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); +// +// std::atomic injectFakeAllocationOnce{true}; +// SCOPED_TESTVALUE_SET( +// "facebook::velox::dwrf::Writer::write", +// std::function(([&](dwrf::Writer* writer) { +// if (!injectFakeAllocationOnce.exchange(false)) { +// return; +// } +// auto& pool = writer->getContext().getMemoryPool( +// dwrf::MemoryUsageCategory::GENERAL); +// const auto fakeAllocationSize = +// kQueryMemoryCapacity - pool.reservedBytes(); +// VELOX_ASSERT_THROW( +// pool.allocate(fakeAllocationSize), "Exceeded memory pool"); +// }))); +// +// auto outputDirectory = TempDirectoryPath::create(); +// auto writerPlan = +// PlanBuilder() +// .values(vectors) +// .tableWrite(outputDirectory->getPath()) +// .project({TableWriteTraits::rowCountColumnName()}) +// .singleAggregation( +// {}, +// {fmt::format("sum({})", +// TableWriteTraits::rowCountColumnName())}) +// .planNode(); +// +// const auto spillDirectory = TempDirectoryPath::create(); +// AssertQueryBuilder(duckDbQueryRunner_) +// .queryCtx(queryCtx) +// .maxDrivers(1) +// .spillDirectory(spillDirectory->getPath()) +// .config(core::QueryConfig::kSpillEnabled, true) +// .config(core::QueryConfig::kWriterSpillEnabled, true) +// // Set file writer flush threshold of zero to always trigger flush in +// // test. +// .config(core::QueryConfig::kWriterFlushThresholdBytes, 0) +// // Set large stripe and dictionary size thresholds to avoid writer +// // internal stripe flush. 
+// .connectorSessionProperty( +// kHiveConnectorId, dwrf::Config::kOrcWriterMaxStripeSizeSession, +// "1GB") +// .connectorSessionProperty( +// kHiveConnectorId, +// dwrf::Config::kOrcWriterMaxDictionaryMemorySession, +// "1GB") +// .plan(std::move(writerPlan)) +// .assertResults(fmt::format("SELECT {}", numRows)); +// +// ASSERT_EQ(arbitrator->stats().numFailures, numPrevArbitrationFailures + 1); +// ASSERT_EQ( +// arbitrator->stats().numNonReclaimableAttempts, +// numPrevNonReclaimableAttempts + 1); +// waitForAllTasksToBeDeleted(); +//} +// +// DEBUG_ONLY_TEST_F( +// TableWriterArbitrationTest, +// arbitrationFromTableWriterWithNoMoreInput) { +// VectorFuzzer::Options options; +// const int batchSize = 1'000; +// options.vectorSize = batchSize; +// options.stringVariableLength = false; +// options.stringLength = 1'000; +// VectorFuzzer fuzzer(options, pool()); +// const int numBatches = 10; +// std::vector vectors; +// int numRows{0}; +// for (int i = 0; i < numBatches; ++i) { +// numRows += batchSize; +// vectors.push_back(fuzzer.fuzzRow(rowType_)); +// } +// +// createDuckDbTable(vectors); +// auto queryPool = memory::memoryManager()->addRootPool( +// "arbitrationFromTableWriterWithNoMoreInput", kQueryMemoryCapacity); +// auto* arbitrator = memory::memoryManager()->arbitrator(); +// const int numPrevArbitrationFailures = arbitrator->stats().numFailures; +// const int numPrevNonReclaimableAttempts = +// arbitrator->stats().numNonReclaimableAttempts; +// const int numPrevReclaimedBytes = arbitrator->stats().reclaimedUsedBytes; +// auto queryCtx = core::QueryCtx::create( +// executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); +// ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); +// +// std::atomic writerNoMoreInput{false}; +// SCOPED_TESTVALUE_SET( +// "facebook::velox::exec::Driver::runInternal::noMoreInput", +// std::function(([&](Operator* op) { +// if (op->operatorType() != "TableWrite") { +// return; +// } +// writerNoMoreInput = true; +// }))); +// +// std::atomic injectGetOutputOnce{true}; +// SCOPED_TESTVALUE_SET( +// "facebook::velox::exec::Driver::runInternal::getOutput", +// std::function(([&](Operator* op) { +// if (op->operatorType() != "TableWrite") { +// return; +// } +// if (!writerNoMoreInput) { +// return; +// } +// if (!injectGetOutputOnce.exchange(false)) { +// return; +// } +// const auto fakeAllocationSize = +// kQueryMemoryCapacity - op->pool()->parent()->reservedBytes(); +// auto* buffer = op->pool()->allocate(fakeAllocationSize); +// op->pool()->free(buffer, fakeAllocationSize); +// }))); +// +// auto outputDirectory = TempDirectoryPath::create(); +// auto writerPlan = +// PlanBuilder() +// .values(vectors) +// .tableWrite(outputDirectory->getPath()) +// .project({TableWriteTraits::rowCountColumnName()}) +// .singleAggregation( +// {}, +// {fmt::format("sum({})", +// TableWriteTraits::rowCountColumnName())}) +// .planNode(); +// +// const auto spillDirectory = TempDirectoryPath::create(); +// AssertQueryBuilder(duckDbQueryRunner_) +// .queryCtx(queryCtx) +// .maxDrivers(1) +// .spillDirectory(spillDirectory->getPath()) +// .config(core::QueryConfig::kSpillEnabled, true) +// .config(core::QueryConfig::kWriterSpillEnabled, true) +// // Set 0 file writer flush threshold to always trigger flush in test. +// .config(core::QueryConfig::kWriterFlushThresholdBytes, 0) +// // Set large stripe and dictionary size thresholds to avoid writer +// // internal stripe flush. 
+// .connectorSessionProperty( +// kHiveConnectorId, dwrf::Config::kOrcWriterMaxStripeSizeSession, +// "1GB") +// .connectorSessionProperty( +// kHiveConnectorId, +// dwrf::Config::kOrcWriterMaxDictionaryMemorySession, +// "1GB") +// .plan(std::move(writerPlan)) +// .assertResults(fmt::format("SELECT {}", numRows)); +// +// ASSERT_EQ( +// arbitrator->stats().numNonReclaimableAttempts, +// numPrevArbitrationFailures); +// ASSERT_EQ(arbitrator->stats().numFailures, numPrevNonReclaimableAttempts); +// ASSERT_GT(arbitrator->stats().reclaimedUsedBytes, numPrevReclaimedBytes); +// waitForAllTasksToBeDeleted(); +//} +// +// DEBUG_ONLY_TEST_F( +// TableWriterArbitrationTest, +// reclaimFromNonReclaimableSortTableWriter) { +// VectorFuzzer::Options options; +// const int batchSize = 1'000; +// options.vectorSize = batchSize; +// options.stringVariableLength = false; +// options.stringLength = 1'000; +// VectorFuzzer fuzzer(options, pool()); +// const int numBatches = 20; +// std::vector vectors; +// int numRows{0}; +// const auto partitionKeyVector = makeFlatVector( +// batchSize, [&](vector_size_t /*unused*/) { return 0; }); +// for (int i = 0; i < numBatches; ++i) { +// numRows += batchSize; +// vectors.push_back(fuzzer.fuzzInputRow(rowType_)); +// vectors.back()->childAt(0) = partitionKeyVector; +// } +// +// createDuckDbTable(vectors); +// +// auto queryPool = memory::memoryManager()->addRootPool( +// "reclaimFromNonReclaimableSortTableWriter", kQueryMemoryCapacity); +// auto* arbitrator = memory::memoryManager()->arbitrator(); +// const int numPrevArbitrationFailures = arbitrator->stats().numFailures; +// const int numPrevNonReclaimableAttempts = +// arbitrator->stats().numNonReclaimableAttempts; +// auto queryCtx = core::QueryCtx::create( +// executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); +// ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); +// +// std::atomic injectFakeAllocationOnce{true}; +// SCOPED_TESTVALUE_SET( +// "facebook::velox::memory::MemoryPoolImpl::reserveThreadSafe", +// std::function(([&](memory::MemoryPool* pool) +// { +// const std::string re(".*sort"); +// if (!RE2::FullMatch(pool->name(), re)) { +// return; +// } +// const int writerMemoryUsage = 4L << 20; +// if (pool->parent()->reservedBytes() < writerMemoryUsage) { +// return; +// } +// if (!injectFakeAllocationOnce.exchange(false)) { +// return; +// } +// const auto fakeAllocationSize = +// kQueryMemoryCapacity - pool->parent()->reservedBytes(); +// VELOX_ASSERT_THROW( +// pool->allocate(fakeAllocationSize), "Exceeded memory pool"); +// }))); +// +// auto outputDirectory = TempDirectoryPath::create(); +// auto writerPlan = +// PlanBuilder() +// .values(vectors) +// .tableWrite( +// outputDirectory->getPath(), +// {"c0"}, +// 4, +// {"c1"}, +// { +// std::make_shared( +// "c2", core::SortOrder{false, false}), +// }) +// .project({TableWriteTraits::rowCountColumnName()}) +// .singleAggregation( +// {}, +// {fmt::format("sum({})", +// TableWriteTraits::rowCountColumnName())}) +// .planNode(); +// +// const auto spillStats = velox::common::globalSpillStats(); +// const auto spillDirectory = TempDirectoryPath::create(); +// AssertQueryBuilder(duckDbQueryRunner_) +// .queryCtx(queryCtx) +// .maxDrivers(1) +// .spillDirectory(spillDirectory->getPath()) +// .config(core::QueryConfig::kSpillEnabled, "true") +// .config(core::QueryConfig::kWriterSpillEnabled, "true") +// // Set file writer flush threshold of zero to always trigger flush in +// // test. 
+// .config(core::QueryConfig::kWriterFlushThresholdBytes, "0") +// // Set large stripe and dictionary size thresholds to avoid writer +// // internal stripe flush. +// .connectorSessionProperty( +// kHiveConnectorId, dwrf::Config::kOrcWriterMaxStripeSizeSession, +// "1GB") +// .connectorSessionProperty( +// kHiveConnectorId, +// dwrf::Config::kOrcWriterMaxDictionaryMemorySession, +// "1GB") +// .plan(std::move(writerPlan)) +// .assertResults(fmt::format("SELECT {}", numRows)); +// +// ASSERT_EQ(arbitrator->stats().numFailures, numPrevArbitrationFailures + 1); +// ASSERT_EQ( +// arbitrator->stats().numNonReclaimableAttempts, +// numPrevNonReclaimableAttempts + 1); +// const auto updatedSpillStats = velox::common::globalSpillStats(); +// ASSERT_EQ(updatedSpillStats, spillStats); +// waitForAllTasksToBeDeleted(); +//} +// +// DEBUG_ONLY_TEST_F(TableWriterArbitrationTest, tableFileWriteError) { +// VectorFuzzer::Options options; +// const int batchSize = 1'000; +// options.vectorSize = batchSize; +// options.stringVariableLength = false; +// options.stringLength = 1'000; +// VectorFuzzer fuzzer(options, pool()); +// const int numBatches = 20; +// std::vector vectors; +// for (int i = 0; i < numBatches; ++i) { +// vectors.push_back(fuzzer.fuzzRow(rowType_)); +// } +// +// createDuckDbTable(vectors); +// +// auto queryPool = memory::memoryManager()->addRootPool( +// "tableFileWriteError", kQueryMemoryCapacity); +// auto queryCtx = core::QueryCtx::create( +// executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); +// ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); +// +// std::atomic_bool injectWriterErrorOnce{true}; +// SCOPED_TESTVALUE_SET( +// "facebook::velox::dwrf::Writer::write", +// std::function(([&](dwrf::Writer* writer) { +// auto& context = writer->getContext(); +// auto& pool = +// context.getMemoryPool(dwrf::MemoryUsageCategory::OUTPUT_STREAM); +// if (static_cast(&pool) +// ->testingMinReservationBytes() == 0) { +// return; +// } +// if (!injectWriterErrorOnce.exchange(false)) { +// return; +// } +// VELOX_FAIL("inject writer error"); +// }))); +// +// const auto spillDirectory = TempDirectoryPath::create(); +// const auto outputDirectory = TempDirectoryPath::create(); +// auto writerPlan = PlanBuilder() +// .values(vectors) +// .tableWrite(outputDirectory->getPath()) +// .planNode(); +// VELOX_ASSERT_THROW( +// AssertQueryBuilder(duckDbQueryRunner_) +// .queryCtx(queryCtx) +// .maxDrivers(1) +// .spillDirectory(spillDirectory->getPath()) +// .config(core::QueryConfig::kSpillEnabled, true) +// .config(core::QueryConfig::kWriterSpillEnabled, true) +// // Set 0 file writer flush threshold to always reclaim memory from +// // file writer. +// .config(core::QueryConfig::kWriterFlushThresholdBytes, 0) +// // Set stripe size to extreme large to avoid writer internal +// // triggered flush. +// .connectorSessionProperty( +// kHiveConnectorId, +// dwrf::Config::kOrcWriterMaxStripeSizeSession, +// "1GB") +// .connectorSessionProperty( +// kHiveConnectorId, +// dwrf::Config::kOrcWriterMaxDictionaryMemorySession, +// "1GB") +// .plan(std::move(writerPlan)) +// .copyResults(pool()), +// "inject writer error"); +// +// waitForAllTasksToBeDeleted(); +//} +// +// DEBUG_ONLY_TEST_F(TableWriterArbitrationTest, tableWriteSpillUseMoreMemory) { +// // Create a large number of vectors to trigger writer spill. 
+// fuzzerOpts_.vectorSize = 1000; +// fuzzerOpts_.stringLength = 2048; +// fuzzerOpts_.stringVariableLength = false; +// VectorFuzzer fuzzer(fuzzerOpts_, pool()); +// +// std::vector vectors; +// for (int i = 0; i < 10; ++i) { +// vectors.push_back(fuzzer.fuzzInputRow(rowType_)); +// } +// +// auto queryPool = memory::memoryManager()->addRootPool( +// "tableWriteSpillUseMoreMemory", kQueryMemoryCapacity / 4); +// auto queryCtx = core::QueryCtx::create( +// executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); +// ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity / 4); +// +// auto fakeLeafPool = queryCtx->pool()->addLeafChild( +// "fakeLeaf", true, FakeMemoryReclaimer::create()); +// const int fakeAllocationSize = kQueryMemoryCapacity * 3 / 16; +// TestAllocation injectedFakeAllocation{ +// fakeLeafPool.get(), +// fakeLeafPool->allocate(fakeAllocationSize), +// fakeAllocationSize}; +// +// TestAllocation injectedWriterAllocation; +// SCOPED_TESTVALUE_SET( +// "facebook::velox::dwrf::Writer::flushInternal", +// std::function(([&](dwrf::Writer* writer) { +// ASSERT_TRUE(memory::underMemoryArbitration()); +// injectedFakeAllocation.free(); +// auto& pool = writer->getContext().getMemoryPool( +// dwrf::MemoryUsageCategory::GENERAL); +// injectedWriterAllocation.pool = &pool; +// injectedWriterAllocation.size = kQueryMemoryCapacity / 8; +// injectedWriterAllocation.buffer = +// pool.allocate(injectedWriterAllocation.size); +// }))); +// +// // Free the extra fake memory allocations to make memory pool state +// // consistent at the end of test. +// std::atomic_bool clearAllocationOnce{true}; +// SCOPED_TESTVALUE_SET( +// "facebook::velox::exec::Task::setError", +// std::function(([&](Task* task) { +// if (!clearAllocationOnce.exchange(false)) { +// return; +// } +// ASSERT_EQ(injectedWriterAllocation.size, kQueryMemoryCapacity / 8); +// injectedWriterAllocation.free(); +// }))); +// +// const auto spillDirectory = TempDirectoryPath::create(); +// const auto outputDirectory = TempDirectoryPath::create(); +// auto writerPlan = PlanBuilder() +// .values(vectors) +// .tableWrite(outputDirectory->getPath()) +// .planNode(); +// VELOX_ASSERT_THROW( +// AssertQueryBuilder(duckDbQueryRunner_) +// .queryCtx(queryCtx) +// .maxDrivers(1) +// .spillDirectory(spillDirectory->getPath()) +// .config(core::QueryConfig::kSpillEnabled, true) +// .config(core::QueryConfig::kWriterSpillEnabled, true) +// // Set 0 file writer flush threshold to always trigger flush in +// // test. +// .config(core::QueryConfig::kWriterFlushThresholdBytes, 0) +// // Set stripe size to extreme large to avoid writer internal +// // triggered flush. +// .connectorSessionProperty( +// kHiveConnectorId, +// dwrf::Config::kOrcWriterMaxStripeSizeSession, +// "1GB") +// .connectorSessionProperty( +// kHiveConnectorId, +// dwrf::Config::kOrcWriterMaxDictionaryMemorySession, +// "1GB") +// .plan(std::move(writerPlan)) +// .copyResults(pool()), +// ""); +// +// waitForAllTasksToBeDeleted(); +//} +// +// DEBUG_ONLY_TEST_F(TableWriterArbitrationTest, tableWriteReclaimOnClose) { +// // Create a large number of vectors to trigger writer spill. 
+// fuzzerOpts_.vectorSize = 1000; +// fuzzerOpts_.stringLength = 1024; +// fuzzerOpts_.stringVariableLength = false; +// VectorFuzzer fuzzer(fuzzerOpts_, pool()); +// std::vector vectors; +// int numRows{0}; +// for (int i = 0; i < 10; ++i) { +// vectors.push_back(fuzzer.fuzzInputRow(rowType_)); +// numRows += vectors.back()->size(); +// } +// +// auto queryPool = memory::memoryManager()->addRootPool( +// "tableWriteSpillUseMoreMemory", kQueryMemoryCapacity); +// auto queryCtx = core::QueryCtx::create( +// executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); +// ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); +// +// auto fakeQueryPool = +// memory::memoryManager()->addRootPool("fake", kQueryMemoryCapacity); +// auto fakeQueryCtx = core::QueryCtx::create( +// executor_.get(), QueryConfig{{}}, {}, nullptr, +// std::move(fakeQueryPool)); +// ASSERT_EQ(fakeQueryCtx->pool()->capacity(), kQueryMemoryCapacity); +// +// auto fakeLeafPool = fakeQueryCtx->pool()->addLeafChild( +// "fakeLeaf", true, FakeMemoryReclaimer::create()); +// +// std::atomic_bool writerNoMoreInput{false}; +// SCOPED_TESTVALUE_SET( +// "facebook::velox::exec::Driver::runInternal::noMoreInput", +// std::function(([&](Operator* op) { +// if (op->operatorType() != "TableWrite") { +// return; +// } +// writerNoMoreInput = true; +// }))); +// +// std::atomic maybeReserveInjectOnce{true}; +// TestAllocation fakeAllocation; +// SCOPED_TESTVALUE_SET( +// "facebook::velox::common::memory::MemoryPoolImpl::maybeReserve", +// std::function([&](memory::MemoryPool* pool) { +// if (!writerNoMoreInput) { +// return; +// } +// if (!maybeReserveInjectOnce.exchange(false)) { +// return; +// } +// // The injection memory allocation to cause maybeReserve on writer +// // close to trigger memory arbitration. The latter tries to reclaim +// // memory from this file writer. +// const size_t injectAllocationSize = kQueryMemoryCapacity; +// fakeAllocation = TestAllocation{ +// .pool = fakeLeafPool.get(), +// .buffer = fakeLeafPool->allocate(injectAllocationSize), +// .size = injectAllocationSize}; +// })); +// +// SCOPED_TESTVALUE_SET( +// "facebook::velox::dwrf::Writer::flushStripe", +// std::function( +// [&](dwrf::Writer* writer) { fakeAllocation.free(); })); +// +// const auto spillDirectory = TempDirectoryPath::create(); +// const auto outputDirectory = TempDirectoryPath::create(); +// auto writerPlan = +// PlanBuilder() +// .values(vectors) +// .tableWrite(outputDirectory->getPath()) +// .singleAggregation( +// {}, +// {fmt::format("sum({})", +// TableWriteTraits::rowCountColumnName())}) +// .planNode(); +// +// AssertQueryBuilder(duckDbQueryRunner_) +// .queryCtx(queryCtx) +// .maxDrivers(1) +// .spillDirectory(spillDirectory->getPath()) +// .config(core::QueryConfig::kSpillEnabled, true) +// .config(core::QueryConfig::kWriterSpillEnabled, true) +// // Set 0 file writer flush threshold to always trigger flush in test. +// .config(core::QueryConfig::kWriterFlushThresholdBytes, 0) +// // Set stripe size to extreme large to avoid writer internal triggered +// // flush. 
+// .connectorSessionProperty( +// kHiveConnectorId, dwrf::Config::kOrcWriterMaxStripeSizeSession, +// "1GB") +// .connectorSessionProperty( +// kHiveConnectorId, +// dwrf::Config::kOrcWriterMaxDictionaryMemorySession, +// "1GB") +// .plan(std::move(writerPlan)) +// .assertResults(fmt::format("SELECT {}", numRows)); +// +// waitForAllTasksToBeDeleted(); +//} +// +// DEBUG_ONLY_TEST_F( +// TableWriterArbitrationTest, +// raceBetweenWriterCloseAndTaskReclaim) { +// const uint64_t memoryCapacity = 512 * MB; +// std::vector vectors = +// createVectors(rowType_, memoryCapacity / 8, fuzzerOpts_); +// const auto expectedResult = +// runWriteTask(vectors, nullptr, false, 1, pool(), kHiveConnectorId, +// false) +// .data; +// auto queryPool = memory::memoryManager()->addRootPool( +// "tableWriteSpillUseMoreMemory", kQueryMemoryCapacity); +// auto queryCtx = core::QueryCtx::create( +// executor_.get(), QueryConfig{{}}, {}, nullptr, std::move(queryPool)); +// ASSERT_EQ(queryCtx->pool()->capacity(), kQueryMemoryCapacity); +// +// std::atomic_bool writerCloseWaitFlag{true}; +// folly::EventCount writerCloseWait; +// std::atomic_bool taskReclaimWaitFlag{true}; +// folly::EventCount taskReclaimWait; +// SCOPED_TESTVALUE_SET( +// "facebook::velox::dwrf::Writer::flushStripe", +// std::function(([&](dwrf::Writer* writer) { +// writerCloseWaitFlag = false; +// writerCloseWait.notifyAll(); +// taskReclaimWait.await([&]() { return !taskReclaimWaitFlag.load(); }); +// }))); +// +// SCOPED_TESTVALUE_SET( +// "facebook::velox::exec::Task::requestPauseLocked", +// std::function(([&](Task* /*unused*/) { +// taskReclaimWaitFlag = false; +// taskReclaimWait.notifyAll(); +// }))); +// +// std::thread queryThread([&]() { +// const auto result = runWriteTask( +// vectors, +// queryCtx, +// false, +// 1, +// pool(), +// kHiveConnectorId, +// true, +// expectedResult); +// }); +// +// writerCloseWait.await([&]() { return !writerCloseWaitFlag.load(); }); +// +// memory::testingRunArbitration(); +// +// queryThread.join(); +// waitForAllTasksToBeDeleted(); +//} +//} // namespace velox::exec::test diff --git a/velox/exec/tests/UnnestTest.cpp b/velox/exec/tests/UnnestTest.cpp index a3bc2b20b8f2..4fd1c9083d30 100644 --- a/velox/exec/tests/UnnestTest.cpp +++ b/velox/exec/tests/UnnestTest.cpp @@ -72,7 +72,7 @@ TEST_P(UnnestTest, arrayWithOrdinality) { {{{1, 2, std::nullopt, 4}}, std::nullopt, {{5, 6}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{{{std::nullopt}}}}, {{7, 8, 9}}}); auto vector = makeRowVector( diff --git a/velox/exec/tests/VeloxIn10MinDemo.cpp b/velox/exec/tests/VeloxIn10MinDemo.cpp index 87d571a1788c..1d859a9ef085 100644 --- a/velox/exec/tests/VeloxIn10MinDemo.cpp +++ b/velox/exec/tests/VeloxIn10MinDemo.cpp @@ -47,24 +47,24 @@ class VeloxIn10MinDemo : public VectorTestBase { // Register type resolver with DuckDB SQL parser. parse::registerTypeResolver(); - // Register the TPC-H Connector Factory. - connector::registerConnectorFactory( + // Register the TPC-H connector::common::Connector Factory. + connector::common::registerConnectorFactory( std::make_shared()); // Create and register a TPC-H connector. 
auto tpchConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::tpch::TpchConnectorFactory::kTpchConnectorName) ->newConnector( kTpchConnectorId, std::make_shared( std::unordered_map())); - connector::registerConnector(tpchConnector); + connector::common::registerConnector(tpchConnector); } ~VeloxIn10MinDemo() { - connector::unregisterConnector(kTpchConnectorId); - connector::unregisterConnectorFactory( + connector::common::unregisterConnector(kTpchConnectorId); + connector::common::unregisterConnectorFactory( connector::tpch::TpchConnectorFactory::kTpchConnectorName); } diff --git a/velox/exec/tests/WindowTest.cpp b/velox/exec/tests/WindowTest.cpp index c2e84f9b0f32..6642d82673f1 100644 --- a/velox/exec/tests/WindowTest.cpp +++ b/velox/exec/tests/WindowTest.cpp @@ -39,13 +39,13 @@ class WindowTest : public OperatorTestBase { filesystems::registerLocalFileSystem(); } - common::SpillConfig getSpillConfig( + velox::common::SpillConfig getSpillConfig( const std::string& spillDir, bool enablePrefixSort) const { const auto prefixSortConfig = enablePrefixSort - ? std::optional(common::PrefixSortConfig()) + ? std::optional(velox::common::PrefixSortConfig()) : std::nullopt; - return common::SpillConfig( + return velox::common::SpillConfig( [spillDir]() -> const std::string& { return spillDir; }, [&](uint64_t) {}, "0.0.0", @@ -653,7 +653,7 @@ DEBUG_ONLY_TEST_F(WindowTest, reserveMemorySort) { auto spillDirectory = exec::test::TempDirectoryPath::create(); auto spillConfig = getSpillConfig(spillDirectory->getPath(), enableSpillPrefixSort); - folly::Synchronized spillStats; + folly::Synchronized spillStats; const auto plan = usePrefixSort ? prefixSortPlan : nonPrefixSortPlan; velox::common::PrefixSortConfig prefixSortConfig = velox::common::PrefixSortConfig{ diff --git a/velox/exec/tests/utils/AssertQueryBuilder.cpp b/velox/exec/tests/utils/AssertQueryBuilder.cpp index 786cbad39726..2bb757e6fd8c 100644 --- a/velox/exec/tests/utils/AssertQueryBuilder.cpp +++ b/velox/exec/tests/utils/AssertQueryBuilder.cpp @@ -146,21 +146,21 @@ AssertQueryBuilder& AssertQueryBuilder::splits( } AssertQueryBuilder& AssertQueryBuilder::split( - const std::shared_ptr& connectorSplit) { + const std::shared_ptr& connectorSplit) { split(getOnlyLeafPlanNodeId(params_.planNode), connectorSplit); return *this; } AssertQueryBuilder& AssertQueryBuilder::split( const core::PlanNodeId& planNodeId, - const std::shared_ptr& connectorSplit) { + const std::shared_ptr& connectorSplit) { splits_[planNodeId].emplace_back( exec::Split(folly::copy(connectorSplit), -1)); return *this; } AssertQueryBuilder& AssertQueryBuilder::splits( - const std::vector>& + const std::vector>& connectorSplits) { splits(getOnlyLeafPlanNodeId(params_.planNode), connectorSplits); return *this; @@ -168,7 +168,7 @@ AssertQueryBuilder& AssertQueryBuilder::splits( AssertQueryBuilder& AssertQueryBuilder::splits( const core::PlanNodeId& planNodeId, - const std::vector>& + const std::vector>& connectorSplits) { std::vector splits; for (auto& connectorSplit : connectorSplits) { diff --git a/velox/exec/tests/utils/AssertQueryBuilder.h b/velox/exec/tests/utils/AssertQueryBuilder.h index 404fad20dda4..7223c6853d22 100644 --- a/velox/exec/tests/utils/AssertQueryBuilder.h +++ b/velox/exec/tests/utils/AssertQueryBuilder.h @@ -98,23 +98,23 @@ class AssertQueryBuilder { /// Add a single connector split to the only leaf plan node. Throws if there /// are multiple leaf nodes. 
  AssertQueryBuilder& split(
-      const std::shared_ptr& connectorSplit);
+      const std::shared_ptr& connectorSplit);
 
   /// Add a single connector split for the specified plan node.
   AssertQueryBuilder& split(
       const core::PlanNodeId& planNodeId,
-      const std::shared_ptr& connectorSplit);
+      const std::shared_ptr& connectorSplit);
 
   /// Add multiple connector splits for the specified plan node.
   AssertQueryBuilder& splits(
       const core::PlanNodeId& planNodeId,
-      const std::vector>&
+      const std::vector>&
          connectorSplits);
 
   /// Add multiple connector splits to the only leaf plan node. Throws if there
   /// are multiple leaf nodes.
   AssertQueryBuilder& splits(
-      const std::vector>&
+      const std::vector>&
          connectorSplits);
 
   /// Sets the QueryCtx.
diff --git a/velox/exec/tests/utils/CMakeLists.txt b/velox/exec/tests/utils/CMakeLists.txt
index 9db2a332a9bd..491a6c3e4d3b 100644
--- a/velox/exec/tests/utils/CMakeLists.txt
+++ b/velox/exec/tests/utils/CMakeLists.txt
@@ -68,7 +68,8 @@ if(${VELOX_BUILD_RUNNER})
     velox_exec_test_lib
     velox_exec
     velox_file_test_utils
-    velox_hive_connector
+    velox_connector_common
+#    velox_hive_connector
     velox_tpch_connector
     velox_local_runner)
 
diff --git a/velox/exec/tests/utils/HashJoinTestBase.h b/velox/exec/tests/utils/HashJoinTestBase.h
index b063389cc607..e7c0acc07b1c 100644
--- a/velox/exec/tests/utils/HashJoinTestBase.h
+++ b/velox/exec/tests/utils/HashJoinTestBase.h
@@ -50,10 +50,10 @@ using SplitInput =
     std::unordered_map>;
 
 // Returns aggregated spilled stats by build and probe operators from 'task'.
-std::pair<common::SpillStats, common::SpillStats> taskSpilledStats(
+std::pair<velox::common::SpillStats, velox::common::SpillStats> taskSpilledStats(
     const exec::Task& task) {
-  common::SpillStats buildStats;
-  common::SpillStats probeStats;
+  velox::common::SpillStats buildStats;
+  velox::common::SpillStats probeStats;
   auto stats = task.taskStats();
   for (auto& pipeline : stats.pipelineStats) {
     for (auto op : pipeline.operatorStats) {
diff --git a/velox/exec/tests/utils/HiveConnectorTestBase.cpp b/velox/exec/tests/utils/HiveConnectorTestBase.cpp
index 42b06df0fe32..777455d9a47b 100644
--- a/velox/exec/tests/utils/HiveConnectorTestBase.cpp
+++ b/velox/exec/tests/utils/HiveConnectorTestBase.cpp
@@ -16,43 +16,50 @@
 
 #include "velox/exec/tests/utils/HiveConnectorTestBase.h"
 
+#include "velox/connectors/common/ConnectorNames.h"
 #include "velox/exec/tests/utils/AssertQueryBuilder.h"
 
 namespace facebook::velox::exec::test {
 
+HiveConnectorTestBase::HiveConnectorTestBase() {}
+
 void HiveConnectorTestBase::SetUp() {
   OperatorTestBase::SetUp();
-  connector::registerConnectorFactory(
-      std::make_shared());
-  auto hiveConnector =
-      connector::getConnectorFactory(
-          connector::hive::HiveConnectorFactory::kHiveConnectorName)
-          ->newConnector(
-              kHiveConnectorId,
-              std::make_shared(
-                  std::unordered_map()),
-              ioExecutor_.get());
-  connector::registerConnector(hiveConnector);
+  // connector::common::registerConnectorFactory(
+  //     std::make_shared());
+  // auto hiveConnector =
+  //     connector::common::getConnectorFactory(
+  //         HiveConnectorFactory::kHiveConnectorName)
+  //         ->newConnector(
+  //             kHiveConnectorId,
+  //             std::make_shared(
+  //                 std::unordered_map()),
+  //             ioExecutor_.get());
+  // connector::common::registerConnector(hiveConnector);
+  objectFactory_ =
+      &facebook::velox::connector::common::ConnectorObjectFactoryRegistry::
+           instance().factoryFor(kHiveConnectorName);
+  connectorId_ = "test-hive";
 }
 
 void HiveConnectorTestBase::TearDown() {
   // Make sure all pending loads are finished or cancelled before unregister
   // connector.
ioExecutor_.reset(); - connector::unregisterConnector(kHiveConnectorId); - connector::unregisterConnectorFactory( - connector::hive::HiveConnectorFactory::kHiveConnectorName); + // connector::common::unregisterConnector(kHiveConnectorId); + // connector::common::unregisterConnectorFactory( + // HiveConnectorFactory::kHiveConnectorName); OperatorTestBase::TearDown(); } void HiveConnectorTestBase::resetHiveConnector( const std::shared_ptr& config) { - connector::unregisterConnector(kHiveConnectorId); + connector::common::unregisterConnector(kHiveConnectorId); auto hiveConnector = - connector::getConnectorFactory( - connector::hive::HiveConnectorFactory::kHiveConnectorName) + connector::common::getConnectorFactory( + connector::common::kHiveConnectorName) ->newConnector(kHiveConnectorId, config, ioExecutor_.get()); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); } std::shared_ptr HiveConnectorTestBase::assertQuery( @@ -65,7 +72,8 @@ std::shared_ptr HiveConnectorTestBase::assertQuery( std::shared_ptr HiveConnectorTestBase::assertQuery( const core::PlanNodePtr& plan, - const std::vector>& splits, + const std::vector>& + splits, const std::string& duckDbSql, const int32_t numPrefetchSplit) { return AssertQueryBuilder(plan, duckDbQueryRunner_) @@ -87,7 +95,22 @@ std::vector> HiveConnectorTestBase::makeFilePaths( return filePaths; } -std::vector> +std::vector> +HiveConnectorTestBase::makeHiveConnectorSplits( + const std::vector>& filePaths) { + std::vector> splits; + for (auto filePath : filePaths) { + splits.push_back(makeHiveConnectorSplit( + filePath->getPath(), + filePath->fileSize(), + filePath->fileModifiedTime(), + 0, + std::numeric_limits::max())); + } + return splits; +} + +std::vector> HiveConnectorTestBase::makeHiveConnectorSplits( const std::string& filePath, uint32_t splitCount, @@ -97,114 +120,126 @@ HiveConnectorTestBase::makeHiveConnectorSplits( partitionKeys, const std::optional>& infoColumns) { + auto& factory = + connector::common::ConnectorObjectFactoryRegistry::instance().factoryFor( + kHiveConnectorName); + auto file = filesystems::getFileSystem(filePath, nullptr)->openFileForRead(filePath); const int64_t fileSize = file->size(); // Take the upper bound. const int64_t splitSize = std::ceil((fileSize) / splitCount); - std::vector> splits; + std::vector> splits; // Add all the splits. 
   for (int i = 0; i < splitCount; i++) {
-    auto splitBuilder = HiveConnectorSplitBuilder(filePath)
-                            .fileFormat(format)
-                            .start(i * splitSize)
-                            .length(splitSize);
-    if (infoColumns.has_value()) {
-      for (auto infoColumn : infoColumns.value()) {
-        splitBuilder.infoColumn(infoColumn.first, infoColumn.second);
-      }
-    }
-    if (partitionKeys.has_value()) {
-      for (auto partitionKey : partitionKeys.value()) {
-        splitBuilder.partitionKey(partitionKey.first, partitionKey.second);
-      }
-    }
-
-    auto split = splitBuilder.build();
+    auto split = makeHiveConnectorSplit(
+        filePath, i * splitSize, splitSize, format, infoColumns, partitionKeys);
     splits.push_back(std::move(split));
   }
   return splits;
 }
 
-std::unique_ptr
-HiveConnectorTestBase::makeColumnHandle(
-    const std::string& name,
-    const TypePtr& type,
-    const std::vector& requiredSubfields) {
-  return makeColumnHandle(name, type, type, requiredSubfields);
-}
-
-std::unique_ptr
-HiveConnectorTestBase::makeColumnHandle(
-    const std::string& name,
-    const TypePtr& dataType,
-    const TypePtr& hiveType,
-    const std::vector& requiredSubfields,
-    connector::hive::HiveColumnHandle::ColumnType columnType) {
-  std::vector subfields;
-  subfields.reserve(requiredSubfields.size());
-  for (auto& path : requiredSubfields) {
-    subfields.emplace_back(path);
-  }
-
-  return std::make_unique(
-      name, columnType, dataType, hiveType, std::move(subfields));
-}
-
-std::vector>
-HiveConnectorTestBase::makeHiveConnectorSplits(
-    const std::vector>& filePaths) {
-  std::vector> splits;
-  for (auto filePath : filePaths) {
-    splits.push_back(makeHiveConnectorSplit(
-        filePath->getPath(),
-        filePath->fileSize(),
-        filePath->fileModifiedTime(),
-        0,
-        std::numeric_limits::max()));
-  }
-  return splits;
-}
-
-std::shared_ptr
+std::shared_ptr
 HiveConnectorTestBase::makeHiveConnectorSplit(
     const std::string& filePath,
     uint64_t start,
     uint64_t length,
     int64_t splitWeight,
     bool cacheable) {
-  return HiveConnectorSplitBuilder(filePath)
-      .start(start)
-      .length(length)
-      .splitWeight(splitWeight)
-      .cacheable(cacheable)
-      .build();
+  folly::dynamic options = folly::dynamic::object();
+  options["splitWeight"] = splitWeight;
+  options["cacheable"] = cacheable;
+  return objectFactory_->makeConnectorSplit(filePath, start, length, options);
 }
 
-std::shared_ptr
+std::shared_ptr
 HiveConnectorTestBase::makeHiveConnectorSplit(
     const std::string& filePath,
     int64_t fileSize,
     int64_t fileModifiedTime,
     uint64_t start,
     uint64_t length) {
-  return HiveConnectorSplitBuilder(filePath)
-      .infoColumn("$file_size", fmt::format("{}", fileSize))
-      .infoColumn("$file_modified_time", fmt::format("{}", fileModifiedTime))
-      .start(start)
-      .length(length)
-      .build();
+  std::unordered_map<std::string, std::string> infoColumns = {
+      {"$file_size", fmt::format("{}", fileSize)},
+      {"$file_modified_time", fmt::format("{}", fileModifiedTime)}};
+
+  folly::dynamic options = folly::dynamic::object();
+  options["fileSize"] = fileSize;
+  options["fileModifiedTime"] = fileModifiedTime;
+  options["infoColumns"] = infoColumns;
+  return objectFactory_->makeConnectorSplit(filePath, start, length, options);
+}
+
+std::shared_ptr
+HiveConnectorTestBase::makeHiveConnectorSplit(
+    const std::string& filePath,
+    uint64_t start,
+    uint64_t length,
+    int64_t splitWeight,
+    bool cacheable,
+    dwio::common::FileFormat fileFormat,
+    const std::unordered_map& infoColumns,
+    const std::unordered_map& partitionKeys) {
+  folly::dynamic options = folly::dynamic::object();
+  options["splitWeight"] = splitWeight;
+  options["cacheable"] = cacheable;
+  options["fileFormat"] = static_cast<int>(fileFormat);
+  options["infoColumns"] = infoColumns;
+  options["partitionKeys"] = partitionKeys;
+  return objectFactory_->makeConnectorSplit(filePath, start, length, options);
+}
+
+std::shared_ptr
+HiveConnectorTestBase::makeTableHandle(
+    velox::common::SubfieldFilters subfieldFilters,
+    const core::TypedExprPtr& remainingFilter,
+    const std::string& tableName,
+    const RowTypePtr& dataColumns,
+    bool filterPushdownEnabled,
+    const std::unordered_map& tableParameters) {
+  folly::dynamic options = folly::dynamic::object();
+  options["filterPushdownEnabled"] = filterPushdownEnabled;
+  options["subfieldFilters"] = subfieldFilters.toDynamic();
+  options["remainingFilter"] = remainingFilter
+      ? folly::dynamic(
+            folly::dynamic::object("expr", serializeTypedExpr(remainingFilter)))
+      : folly::dynamic();
+  return objectFactory_->makeTableHandle(tableName, dataColumns, options);
+}
+
+std::unique_ptr
+HiveConnectorTestBase::makeColumnHandle(
+    const std::string& name,
+    const TypePtr& type,
+    const std::vector& requiredSubfields) {
+  return makeColumnHandle(name, type, type, requiredSubfields);
+}
+
+std::unique_ptr
+HiveConnectorTestBase::makeColumnHandle(
+    const std::string& name,
+    const TypePtr& dataType,
+    const TypePtr& hiveType,
+    const std::vector& requiredSubfields,
+    folly::dynamic columnType) {
+  // Previously: HiveColumnHandle::ColumnType columnType.
+  folly::dynamic options = folly::dynamic::object;
+
+  options["hiveTypeKind"] = hiveType->kind();
+  options["hiveType"] = hiveType->serialize();
+  options["requiredSubfields"] = requiredSubfields;
+  options["columnType"] = std::move(columnType);
+
+  return objectFactory_->makeColumnHandle(name, dataType, options);
 }
 
 // static
-std::shared_ptr
+std::shared_ptr
 HiveConnectorTestBase::makeHiveInsertTableHandle(
     const std::vector& tableColumnNames,
     const std::vector& tableColumnTypes,
     const std::vector& partitionedBy,
-    std::shared_ptr locationHandle,
+    std::shared_ptr locationHandle,
     const dwio::common::FileFormat tableStorageFormat,
-    const std::optional compressionKind,
+    const std::optional compressionKind,
     const std::shared_ptr& writerOptions,
     const bool ensureFiles) {
   return makeHiveInsertTableHandle(
@@ -221,24 +256,24 @@ HiveConnectorTestBase::makeHiveInsertTableHandle(
 }
 
 // static
-std::shared_ptr
+
+std::shared_ptr
 HiveConnectorTestBase::makeHiveInsertTableHandle(
     const std::vector& tableColumnNames,
     const std::vector& tableColumnTypes,
     const std::vector& partitionedBy,
-    std::shared_ptr bucketProperty,
-    std::shared_ptr locationHandle,
-    const dwio::common::FileFormat tableStorageFormat,
-    const std::optional compressionKind,
+    std::shared_ptr locationHandle,
+    const dwio::common::FileFormat fileFormat,
+    const std::optional compressionKind,
     const std::unordered_map& serdeParameters,
     const std::shared_ptr& writerOptions,
-    const bool ensureFiles) {
-  std::vector>
+    const bool ensureFiles,
+    folly::dynamic options) {
+  std::vector>
       columnHandles;
   std::vector bucketedBy;
   std::vector bucketedTypes;
-  std::vector>
-      sortedBy;
+  std::vector> sortedBy;
   if (bucketProperty != nullptr) {
     bucketedBy = bucketProperty->bucketedBy();
     bucketedTypes = bucketProperty->bucketedTypes();
@@ -264,16 +299,17 @@ HiveConnectorTestBase::makeHiveInsertTableHandle(
             tableColumnNames.at(i)) != partitionedBy.cend()) {
       ++numPartitionColumns;
       columnHandles.push_back(
-          std::make_shared(
+          std::make_shared(
              tableColumnNames.at(i),
-              connector::hive::HiveColumnHandle::ColumnType::kPartitionKey,
+              connector::common::ConnectorColumnHandle::ColumnType::
+                  kPartitionKey,
              tableColumnTypes.at(i),
              tableColumnTypes.at(i)));
     } else {
       columnHandles.push_back(
-          std::make_shared(
+          std::make_shared(
              tableColumnNames.at(i),
-              connector::hive::HiveColumnHandle::ColumnType::kRegular,
+              connector::common::ConnectorColumnHandle::ColumnType::kRegular,
              tableColumnTypes.at(i),
              tableColumnTypes.at(i)));
     }
@@ -282,48 +318,81 @@ HiveConnectorTestBase::makeHiveInsertTableHandle(
   VELOX_CHECK_EQ(numBucketColumns, bucketedBy.size());
   VELOX_CHECK_EQ(numSortingColumns, sortedBy.size());
 
-  return std::make_shared(
-      columnHandles,
-      locationHandle,
-      tableStorageFormat,
-      bucketProperty,
+  // Wrap Hive-specific parameters into folly::dynamic to avoid a direct
+  // reference to connectors/hive/ headers.
+  if (options.isNull()) {
+    options = folly::dynamic::object;
+  }
+  options["partitionedBy"] = partitionedBy;
+  options["bucketProperty"] = bucketProperty->serialize();
+  options["locationHandle"] = locationHandle->serialize();
+  options["fileFormat"] = static_cast<int>(fileFormat);
+  options["serdeParameters"] = serdeParameters;
+  options["writerOptions"] = writerOptions->serialize();
+  options["ensureFiles"] = ensureFiles;
+
+  return objectFactory_->makeInsertTableHandle(
+      tableColumnNames,
+      tableColumnTypes,
+      std::move(locationHandle),
       compressionKind,
-      serdeParameters,
-      writerOptions,
-      ensureFiles);
+      options);
 }
 
-std::shared_ptr
+std::unique_ptr
 HiveConnectorTestBase::regularColumn(
     const std::string& name,
     const TypePtr& type) {
-  return std::make_shared(
-      name,
-      connector::hive::HiveColumnHandle::ColumnType::kRegular,
-      type,
-      type);
+  // No Hive header needed here; just a string tag.
+  folly::dynamic options = folly::dynamic::object(
+      "columnType", connector::common::kColumnTypeRegular);
+  return objectFactory_->makeColumnHandle(name, type, options);
 }
 
-std::shared_ptr
-HiveConnectorTestBase::synthesizedColumn(
+std::unique_ptr
+HiveConnectorTestBase::partitionKey(
     const std::string& name,
     const TypePtr& type) {
-  return std::make_shared(
-      name,
-      connector::hive::HiveColumnHandle::ColumnType::kSynthesized,
-      type,
-      type);
+  folly::dynamic options = folly::dynamic::object(
+      "columnType", connector::common::kColumnTypePartition);
+  return objectFactory_->makeColumnHandle(name, type, options);
 }
 
-std::shared_ptr
-HiveConnectorTestBase::partitionKey(
+std::unique_ptr
+HiveConnectorTestBase::synthesizedColumn(
     const std::string& name,
     const TypePtr& type) {
-  return std::make_shared(
-      name,
-      connector::hive::HiveColumnHandle::ColumnType::kPartitionKey,
-      type,
-      type);
+  folly::dynamic options = folly::dynamic::object(
+      "columnType", connector::common::kColumnTypeSynthesized);
+  return objectFactory_->makeColumnHandle(name, type, options);
 }
 
+// std::shared_ptr
+// HiveConnectorTestBase::regularColumn(
+//     const std::string& name,
+//     const TypePtr& type) {
+//   return std::make_shared(
+//       name, HiveColumnHandle::ColumnType::kRegular, type, type);
+// }
+//
+// std::shared_ptr
+// HiveConnectorTestBase::synthesizedColumn(
+//     const std::string& name,
+//     const TypePtr& type) {
+//   return std::make_shared(
+//       name,
+//       HiveColumnHandle::ColumnType::kSynthesized,
+//       type,
+//       type);
+// }
+//
+// std::shared_ptr
+// HiveConnectorTestBase::partitionKey(
+//     const std::string& name,
+//     const TypePtr& type) {
+//   return std::make_shared(
+//       name,
+//       HiveColumnHandle::ColumnType::kPartitionKey,
+//       type,
+//       type);
+// }
+
 } // namespace facebook::velox::exec::test
diff --git a/velox/exec/tests/utils/HiveConnectorTestBase.h
b/velox/exec/tests/utils/HiveConnectorTestBase.h index e63df79a898e..e52f72797865 100644 --- a/velox/exec/tests/utils/HiveConnectorTestBase.h +++ b/velox/exec/tests/utils/HiveConnectorTestBase.h @@ -15,10 +15,12 @@ */ #pragma once -#include "velox/connectors/hive/HiveConnector.h" -#include "velox/connectors/hive/HiveConnectorSplit.h" -#include "velox/connectors/hive/HiveDataSink.h" -#include "velox/connectors/hive/TableHandle.h" +// #include "velox/connectors/hive/HiveConnector.h" +// #include "velox/connectors/hive/HiveConnectorObjectFactory.h" +// #include "velox/connectors/hive/HiveConnectorSplit.h" +// #include "velox/connectors/hive/HiveDataSink.h" + +#include "velox/connectors/common/ConnectorObjectFactory.h" #include "velox/exec/Operator.h" #include "velox/exec/tests/utils/OperatorTestBase.h" #include "velox/exec/tests/utils/TempFilePath.h" @@ -26,11 +28,13 @@ namespace facebook::velox::exec::test { -static const std::string kHiveConnectorId = "test-hive"; +// Use inline constexpr instead of const std::string for better efficiency and +// less errors +inline constexpr const std::string kHiveConnectorId = "test-hive"; using ConnectorColumnHandleMap = std::unordered_map< std::string, - std::shared_ptr>; + std::shared_ptr>; class HiveConnectorTestBase : public OperatorTestBase { public: @@ -50,17 +54,31 @@ class HiveConnectorTestBase : public OperatorTestBase { std::shared_ptr assertQuery( const core::PlanNodePtr& plan, - const std::vector>& splits, + const std::vector>& + splits, const std::string& duckDbSql, const int32_t numPrefetchSplit); static std::vector> makeFilePaths(int count); - static std::vector> + static std::vector> makeHiveConnectorSplits( const std::vector>& filePaths); - static std::shared_ptr + /// Split file at path 'filePath' into 'splitCount' splits. If not local file, + /// file size can be given as 'externalSize'. + static std::vector> + makeHiveConnectorSplits( + const std::string& filePath, + uint32_t splitCount = 1, + dwio::common::FileFormat format = dwio::common::FileFormat::DWRF, + const std::optional< + std::unordered_map>>& + partitionKeys = {}, + const std::optional>& + infoColumns = {}); + + static std::shared_ptr makeHiveConnectorSplit( const std::string& filePath, uint64_t start = 0, @@ -68,7 +86,7 @@ class HiveConnectorTestBase : public OperatorTestBase { int64_t splitWeight = 0, bool cacheable = true); - static std::shared_ptr + static std::shared_ptr makeHiveConnectorSplit( const std::string& filePath, int64_t fileSize, @@ -76,41 +94,40 @@ class HiveConnectorTestBase : public OperatorTestBase { uint64_t start, uint64_t length); - /// Split file at path 'filePath' into 'splitCount' splits. If not local file, - /// file size can be given as 'externalSize'. 
- static std::vector> - makeHiveConnectorSplits( + std::shared_ptr makeHiveConnectorSplit( const std::string& filePath, - uint32_t splitCount, - dwio::common::FileFormat format, - const std::optional< - std::unordered_map>>& - partitionKeys = {}, - const std::optional>& - infoColumns = {}); - - static std::shared_ptr makeTableHandle( - common::SubfieldFilters subfieldFilters = {}, + uint64_t start, + uint64_t length, + int64_t splitWeight = 0, + bool cacheable = true, + dwio::common::FileFormat fileFormat = dwio::common::FileFormat::DWRF, + const std::unordered_map& infoColumns = {}, + const std::unordered_map& partitionKeys = {}); + + static std::shared_ptr + makeTableHandle( + velox::common::SubfieldFilters subfieldFilters = {}, const core::TypedExprPtr& remainingFilter = nullptr, const std::string& tableName = "hive_table", const RowTypePtr& dataColumns = nullptr, bool filterPushdownEnabled = true, - const std::unordered_map& tableParameters = - {}) { - return std::make_shared( - kHiveConnectorId, - tableName, - filterPushdownEnabled, - std::move(subfieldFilters), - remainingFilter, - dataColumns, - tableParameters); - } + const std::unordered_map& tableParameters = {}); + // { + // return std::make_shared( + // kHiveConnectorId, + // tableName, + // filterPushdownEnabled, + // std::move(subfieldFilters), + // remainingFilter, + // dataColumns, + // tableParameters); + // } /// @param name Column name. /// @param type Column type. /// @param Required subfields of this column. - static std::unique_ptr makeColumnHandle( + static std::unique_ptr + makeColumnHandle( const std::string& name, const TypePtr& type, const std::vector& requiredSubfields); @@ -119,25 +136,26 @@ class HiveConnectorTestBase : public OperatorTestBase { /// @param type Column type. /// @param type Hive type. /// @param Required subfields of this column. - static std::unique_ptr makeColumnHandle( + static std::unique_ptr + makeColumnHandle( const std::string& name, const TypePtr& dataType, const TypePtr& hiveType, const std::vector& requiredSubfields, - connector::hive::HiveColumnHandle::ColumnType columnType = - connector::hive::HiveColumnHandle::ColumnType::kRegular); + HiveColumnHandle::ColumnType columnType = + HiveColumnHandle::ColumnType::kRegular); /// @param targetDirectory Final directory of the target table after commit. /// @param writeDirectory Write directory of the target table before commit. /// @param tableType Whether to create a new table, insert into an existing /// table, or write a temporary table. /// @param writeMode How to write to the target directory. - static std::shared_ptr makeLocationHandle( + static std::shared_ptr makeLocationHandle( std::string targetDirectory, std::optional writeDirectory = std::nullopt, - connector::hive::LocationHandle::TableType tableType = - connector::hive::LocationHandle::TableType::kNew) { - return std::make_shared( + connector::common::LocationHandle::TableType tableType = + connector::common::LocationHandle::TableType::kNew) { + return std::make_shared( targetDirectory, writeDirectory.value_or(targetDirectory), tableType); } @@ -154,45 +172,43 @@ class HiveConnectorTestBase : public OperatorTestBase { /// @param serdeParameters Table writer configuration parameters. /// @param ensureFiles When this option is set the HiveDataSink will always /// create a file even if there is no data. 
- static std::shared_ptr + static std::shared_ptr makeHiveInsertTableHandle( const std::vector& tableColumnNames, const std::vector& tableColumnTypes, const std::vector& partitionedBy, - std::shared_ptr bucketProperty, - std::shared_ptr locationHandle, - const dwio::common::FileFormat tableStorageFormat = + std::shared_ptr locationHandle, + const dwio::common::FileFormat fileFormat = dwio::common::FileFormat::DWRF, - const std::optional compressionKind = {}, + const std::optional compressionKind = {}, const std::unordered_map& serdeParameters = {}, const std::shared_ptr& writerOptions = nullptr, - const bool ensureFiles = false); + const bool ensureFiles = false, + folly::dynamic options = nullptr); - static std::shared_ptr + static std::shared_ptr makeHiveInsertTableHandle( const std::vector& tableColumnNames, const std::vector& tableColumnTypes, const std::vector& partitionedBy, - std::shared_ptr locationHandle, - const dwio::common::FileFormat tableStorageFormat = + std::shared_ptr locationHandle, + const dwio::common::FileFormat fileFormat = dwio::common::FileFormat::DWRF, - const std::optional compressionKind = {}, + const std::optional compressionKind = {}, const std::shared_ptr& writerOptions = nullptr, const bool ensureFiles = false); - static std::shared_ptr regularColumn( - const std::string& name, - const TypePtr& type); + static std::shared_ptr + regularColumn(const std::string& name, const TypePtr& type); - static std::shared_ptr partitionKey( + static std::shared_ptr partitionKey( const std::string& name, const TypePtr& type); - static std::shared_ptr synthesizedColumn( - const std::string& name, - const TypePtr& type); + static std::shared_ptr + synthesizedColumn(const std::string& name, const TypePtr& type); static ConnectorColumnHandleMap allRegularColumns(const RowTypePtr& rowType) { ConnectorColumnHandleMap assignments; @@ -203,17 +219,21 @@ class HiveConnectorTestBase : public OperatorTestBase { } return assignments; } -}; -/// Same as connector::hive::HiveConnectorBuilder, except that this defaults -/// connectorId to kHiveConnectorId. -class HiveConnectorSplitBuilder - : public connector::hive::HiveConnectorSplitBuilder { - public: - explicit HiveConnectorSplitBuilder(std::string filePath) - : connector::hive::HiveConnectorSplitBuilder(filePath) { - connectorId(kHiveConnectorId); - } + private: + static const facebook::velox::connector::common::ConnectorObjectFactory* + objectFactory_; + std::string connectorId_; }; +///// Same as HiveConnectorBuilder, except that this defaults +///// connectorId to kHiveConnectorId. 
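// Complementary sketch for the declarations above: how the consuming side (a
// connector object factory, or a test assertion) can read fields back out of
// the folly::dynamic options bag. This is plain folly usage; the key names
// follow the ones used in this test base and the JSON literal is only an
// example input, not output produced by the real factory.
#include <folly/dynamic.h>
#include <folly/json.h>

#include <iostream>
#include <string>

int main() {
  const folly::dynamic options = folly::parseJson(
      R"({"columnType":"partition","ensureFiles":false,"fileFormat":1})");

  // Required field: at() throws if the key is missing, so a misconfigured
  // test fails loudly instead of silently defaulting.
  const std::string columnType = options.at("columnType").asString();

  // Optional fields: fall back to defaults when the caller did not set them.
  const bool ensureFiles = options.getDefault("ensureFiles", false).asBool();
  const bool hasBucketProperty = options.count("bucketProperty") > 0;

  std::cout << columnType << " " << ensureFiles << " " << hasBucketProperty
            << "\n";
  return 0;
}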
+// class HiveConnectorSplitBuilder : public HiveConnectorSplitBuilder { +// public: +// explicit HiveConnectorSplitBuilder(std::string filePath) +// : HiveConnectorSplitBuilder(filePath) { +// connectorId(kHiveConnectorId); +// } +// }; + } // namespace facebook::velox::exec::test diff --git a/velox/exec/tests/utils/IndexLookupJoinTestBase.cpp b/velox/exec/tests/utils/IndexLookupJoinTestBase.cpp index fb2bf1b7f4fa..89e11f50bf69 100644 --- a/velox/exec/tests/utils/IndexLookupJoinTestBase.cpp +++ b/velox/exec/tests/utils/IndexLookupJoinTestBase.cpp @@ -280,12 +280,12 @@ facebook::velox::core::TableScanNodePtr IndexLookupJoinTestBase::makeIndexScanNode( const std::shared_ptr& planNodeIdGenerator, - const std::shared_ptr + const std::shared_ptr indexTableHandle, const facebook::velox::RowTypePtr& outputType, std::unordered_map< std::string, - std::shared_ptr>& + std::shared_ptr>& assignments) { auto planBuilder = facebook::velox::exec::test::PlanBuilder( planNodeIdGenerator, pool_.get()); diff --git a/velox/exec/tests/utils/IndexLookupJoinTestBase.h b/velox/exec/tests/utils/IndexLookupJoinTestBase.h index b8712b9508e3..cf7ba146edd3 100644 --- a/velox/exec/tests/utils/IndexLookupJoinTestBase.h +++ b/velox/exec/tests/utils/IndexLookupJoinTestBase.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "velox/connectors/Connector.h" +#include "../../../connectors/common/Connector.h" #include "velox/core/PlanNode.h" #include "velox/exec/tests/utils/AssertQueryBuilder.h" #include "velox/exec/tests/utils/HiveConnectorTestBase.h" @@ -122,12 +122,12 @@ class IndexLookupJoinTestBase facebook::velox::core::TableScanNodePtr makeIndexScanNode( const std::shared_ptr& planNodeIdGenerator, - const std::shared_ptr + const std::shared_ptr indexTableHandle, const facebook::velox::RowTypePtr& outputType, std::unordered_map< std::string, - std::shared_ptr>& + std::shared_ptr>& assignments); /// Generate sequence storage table which will be persisted by mock zippydb diff --git a/velox/exec/tests/utils/LocalExchangeSource.cpp b/velox/exec/tests/utils/LocalExchangeSource.cpp index ca5545fd5a3f..fddaf72c89ce 100644 --- a/velox/exec/tests/utils/LocalExchangeSource.cpp +++ b/velox/exec/tests/utils/LocalExchangeSource.cpp @@ -106,12 +106,12 @@ class LocalExchangeSource : public exec::ExchangeSource { numPages_ += pages.size(); totalBytes_ += totalBytes; if (data.empty()) { - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::exec::test::LocalExchangeSource::timeout", this); } try { - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::exec::test::LocalExchangeSource", this); } catch (const std::exception& e) { queue_->setError(e.what()); @@ -165,7 +165,7 @@ class LocalExchangeSource : public exec::ExchangeSource { } void pause() override { - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::exec::test::LocalExchangeSource::pause", nullptr); auto buffers = OutputBufferManager::getInstanceRef(); VELOX_CHECK_NOT_NULL(buffers, "invalid OutputBufferManager"); diff --git a/velox/exec/tests/utils/LocalRunnerTestBase.cpp b/velox/exec/tests/utils/LocalRunnerTestBase.cpp index 398df844ef17..a41c3d454292 100644 --- a/velox/exec/tests/utils/LocalRunnerTestBase.cpp +++ b/velox/exec/tests/utils/LocalRunnerTestBase.cpp @@ -57,19 +57,19 @@ void LocalRunnerTestBase::ensureTestData() { } void LocalRunnerTestBase::setupConnector() { - 
connector::unregisterConnector(kHiveConnectorId); + connector::common::unregisterConnector(kHiveConnectorId); std::unordered_map configs; configs[connector::hive::HiveConfig::kLocalDataPath] = testDataPath_; configs[connector::hive::HiveConfig::kLocalFileFormat] = localFileFormat_; auto hiveConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( kHiveConnectorId, std::make_shared(std::move(configs)), ioExecutor_.get()); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); } void LocalRunnerTestBase::makeTables( @@ -107,19 +107,20 @@ LocalRunnerTestBase::makeSimpleSplitSourceFactory( const runner::MultiFragmentPlanPtr& plan) { std::unordered_map< core::PlanNodeId, - std::vector>> + std::vector>> nodeSplitMap; for (auto& fragment : plan->fragments()) { for (auto& scan : fragment.scans) { auto& name = scan->tableHandle()->name(); auto files = tableFilePaths_[name]; VELOX_CHECK(!files.empty(), "No splits known for {}", name); - std::vector> splits; + std::vector> splits; for (auto& file : files) { - splits.push_back(connector::hive::HiveConnectorSplitBuilder(file) - .connectorId(kHiveConnectorId) - .fileFormat(dwio::common::FileFormat::DWRF) - .build()); + splits.push_back(makeHiveConnectorSplit(file)); +// splits.push_back(connector::hive::HiveConnectorSplitBuilder(file) +// .connectorId(kHiveConnectorId) +// .fileFormat(dwio::common::FileFormat::DWRF) +// .build()); } nodeSplitMap[scan->id()] = std::move(splits); } diff --git a/velox/exec/tests/utils/OperatorTestBase.cpp b/velox/exec/tests/utils/OperatorTestBase.cpp index 7a905c23b3e9..4705955a59d0 100644 --- a/velox/exec/tests/utils/OperatorTestBase.cpp +++ b/velox/exec/tests/utils/OperatorTestBase.cpp @@ -182,7 +182,7 @@ void OperatorTestBase::TearDown() { std::shared_ptr OperatorTestBase::assertQuery( const core::PlanNodePtr& plan, - const std::vector>& + const std::vector>& connectorSplits, const std::string& duckDbSql, std::optional> sortingKeys) { diff --git a/velox/exec/tests/utils/OperatorTestBase.h b/velox/exec/tests/utils/OperatorTestBase.h index 101e77807e6f..96ff35dd6233 100644 --- a/velox/exec/tests/utils/OperatorTestBase.h +++ b/velox/exec/tests/utils/OperatorTestBase.h @@ -99,7 +99,7 @@ class OperatorTestBase : public virtual testing::Test, /// Assumes plan has a single leaf node. All splits are added to that node. std::shared_ptr assertQueryOrdered( const core::PlanNodePtr& plan, - const std::vector>& splits, + const std::vector>& splits, const std::string& duckDbSql, const std::vector& sortingKeys) { return assertQuery(plan, splits, duckDbSql, sortingKeys); @@ -136,7 +136,7 @@ class OperatorTestBase : public virtual testing::Test, /// Assumes plan has a single leaf node. All splits are added to that node. 
std::shared_ptr assertQuery( const core::PlanNodePtr& plan, - const std::vector>& + const std::vector>& connectorSplits, const std::string& duckDbSql, std::optional> sortingKeys = std::nullopt); @@ -168,6 +168,8 @@ class OperatorTestBase : public virtual testing::Test, RowTypePtr rowType, const parse::ParseOptions& options = {}); + + void writeToFiles( const std::vector& filePaths, std::vector vectors); diff --git a/velox/exec/tests/utils/PlanBuilder.cpp b/velox/exec/tests/utils/PlanBuilder.cpp index 01101141cdbb..5407ff278ef8 100644 --- a/velox/exec/tests/utils/PlanBuilder.cpp +++ b/velox/exec/tests/utils/PlanBuilder.cpp @@ -74,7 +74,7 @@ PlanBuilder& PlanBuilder::tableScan( const RowTypePtr& dataColumns, const std::unordered_map< std::string, - std::shared_ptr>& assignments) { + std::shared_ptr>& assignments) { return TableScanBuilder(*this) .filtersAsNode(filtersAsNode_ ? planNodeIdGenerator_ : nullptr) .outputType(outputType) @@ -94,7 +94,7 @@ PlanBuilder& PlanBuilder::tableScan( const RowTypePtr& dataColumns, const std::unordered_map< std::string, - std::shared_ptr>& assignments) { + std::shared_ptr>& assignments) { return TableScanBuilder(*this) .filtersAsNode(filtersAsNode_ ? planNodeIdGenerator_ : nullptr) .tableName(tableName) @@ -114,7 +114,7 @@ PlanBuilder& PlanBuilder::tpchTableScan( std::string_view connectorId) { std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignmentsMap; std::vector outputTypes; @@ -203,7 +203,7 @@ core::PlanNodePtr PlanBuilder::TableScanBuilder::build(core::PlanNodeId id) { const RowTypePtr& parseType = dataColumns_ ? dataColumns_ : outputType_; core::TypedExprPtr filterNodeExpr; - common::SubfieldFilters filters; + velox::common::SubfieldFilters filters; filters.reserve(subfieldFilters_.size()); auto queryCtx = core::QueryCtx::create(); exec::SimpleExpressionEvaluator evaluator(queryCtx.get(), planBuilder_.pool_); @@ -219,7 +219,7 @@ core::PlanNodePtr PlanBuilder::TableScanBuilder::build(core::PlanNodeId id) { auto it = columnAliases_.find(subfield.toString()); if (it != columnAliases_.end()) { - subfield = common::Subfield(it->second); + subfield = velox::common::Subfield(it->second); } VELOX_CHECK_EQ( filters.count(subfield), @@ -290,10 +290,10 @@ core::PlanNodePtr PlanBuilder::TableWriterBuilder::build(core::PlanNodeId id) { outputType->childAt(i))); } - auto locationHandle = std::make_shared( + auto locationHandle = std::make_shared( outputDirectoryPath_, outputDirectoryPath_, - connector::hive::LocationHandle::TableType::kNew, + connector::common::LocationHandle::TableType::kNew, outputFileName_); std::shared_ptr bucketProperty; @@ -341,7 +341,7 @@ core::PlanNodePtr PlanBuilder::TableWriterBuilder::build(core::PlanNodeId id) { insertHandle_, false, TableWriteTraits::outputType(aggregationNode), - connector::CommitStrategy::kNoCommit, + connector::common::CommitStrategy::kNoCommit, upstreamNode); VELOX_CHECK(!writeNode->supportsBarrier()); return writeNode; @@ -579,7 +579,7 @@ PlanBuilder& PlanBuilder::tableWrite( const std::unordered_map& serdeParameters, const std::shared_ptr& options, const std::string& outputFileName, - const common::CompressionKind compressionKind, + const velox::common::CompressionKind compressionKind, const RowTypePtr& schema, const bool ensureFiles) { return TableWriterBuilder(*this) diff --git a/velox/exec/tests/utils/PlanBuilder.h b/velox/exec/tests/utils/PlanBuilder.h index 6fc72c163c4c..9a2807342c55 100644 --- a/velox/exec/tests/utils/PlanBuilder.h +++ b/velox/exec/tests/utils/PlanBuilder.h @@ 
-130,7 +130,7 @@ class PlanBuilder { /// types (for all columns) in this argument as opposed to 'outputType', where /// you define the output types only. See 'missingColumns' test in /// 'TableScanTest'. - /// @param assignments Optional ConnectorColumnHandles. + /// @param assignments Optional connector::common::ConnectorColumnHandles. PlanBuilder& tableScan( const RowTypePtr& outputType, const std::vector& subfieldFilters = {}, @@ -138,7 +138,7 @@ class PlanBuilder { const RowTypePtr& dataColumns = nullptr, const std::unordered_map< std::string, - std::shared_ptr>& assignments = {}); + std::shared_ptr>& assignments = {}); /// Add a TableScanNode to scan a Hive table. /// @@ -170,7 +170,7 @@ class PlanBuilder { const RowTypePtr& dataColumns = nullptr, const std::unordered_map< std::string, - std::shared_ptr>& assignments = {}); + std::shared_ptr>& assignments = {}); /// Add a TableScanNode to scan a TPC-H table. /// @@ -272,19 +272,19 @@ class PlanBuilder { /// @param tableHandle Optional tableHandle. Other builder arguments such as /// the `subfieldFilters` and `remainingFilter` will be ignored. TableScanBuilder& tableHandle( - std::shared_ptr tableHandle) { + std::shared_ptr tableHandle) { tableHandle_ = std::move(tableHandle); return *this; } - /// @param assignments Optional ConnectorColumnHandles. + /// @param assignments Optional connector::common::ConnectorColumnHandles. /// outputType names should match the keys in the 'assignments' map. The /// 'assignments' map may contain more columns than 'outputType' if some /// columns are only used by pushed-down filters. TableScanBuilder& assignments( std::unordered_map< std::string, - std::shared_ptr> assignments) { + std::shared_ptr> assignments) { assignments_ = std::move(assignments); return *this; } @@ -307,10 +307,10 @@ class PlanBuilder { core::ExprPtr remainingFilter_; RowTypePtr dataColumns_; std::unordered_map columnAliases_; - std::shared_ptr tableHandle_; + std::shared_ptr tableHandle_; std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments_; // produce filters as a FilterNode instead of pushdown. @@ -432,7 +432,7 @@ class PlanBuilder { /// @param compressionKind Compression scheme to use for writing the /// output data files. TableWriterBuilder& compressionKind( - common::CompressionKind compressionKind) { + velox::common::CompressionKind compressionKind) { compressionKind_ = compressionKind; return *this; } @@ -472,7 +472,7 @@ class PlanBuilder { std::shared_ptr options_; dwio::common::FileFormat fileFormat_{dwio::common::FileFormat::DWRF}; - common::CompressionKind compressionKind_{common::CompressionKind_NONE}; + velox::common::CompressionKind compressionKind_{common::CompressionKind_NONE}; bool ensureFiles_{false}; }; @@ -691,7 +691,7 @@ class PlanBuilder { const std::unordered_map& serdeParameters = {}, const std::shared_ptr& options = nullptr, const std::string& outputFileName = "", - const common::CompressionKind = common::CompressionKind_NONE, + const velox::common::CompressionKind = velox::common::CompressionKind_NONE, const RowTypePtr& schema = nullptr, const bool ensureFiles = false); diff --git a/velox/exec/tests/utils/QueryAssertions.h b/velox/exec/tests/utils/QueryAssertions.h index 3acfa88885d0..cd809188e2bc 100644 --- a/velox/exec/tests/utils/QueryAssertions.h +++ b/velox/exec/tests/utils/QueryAssertions.h @@ -128,9 +128,9 @@ class ScopedOOMInjector { } // Make sure TestValues are enabled. 
- common::testutil::TestValue::enable(); + velox::common::testutil::TestValue::enable(); - common::testutil::TestValue::set( + velox::common::testutil::TestValue::set( kInjectionPoint, std::function([&](memory::MemoryPool*) { const auto currentTime = now(); @@ -145,7 +145,7 @@ class ScopedOOMInjector { } ~ScopedOOMInjector() { - common::testutil::TestValue::clear(kInjectionPoint); + velox::common::testutil::TestValue::clear(kInjectionPoint); enabled_ = false; } diff --git a/velox/exec/tests/utils/TableWriterTestBase.cpp b/velox/exec/tests/utils/TableWriterTestBase.cpp index ebf2eb1f0599..5ce0033b7f61 100644 --- a/velox/exec/tests/utils/TableWriterTestBase.cpp +++ b/velox/exec/tests/utils/TableWriterTestBase.cpp @@ -16,1076 +16,1164 @@ #include "velox/exec/tests/utils/TableWriterTestBase.h" -namespace velox::exec::test { +#include "velox/connectors/common/ConnectorObjectFactoryRegistry.h" +#include "velox/connectors/common/ConnectorNames.h" -TableWriterTestBase::TestParam::TestParam( - FileFormat fileFormat, - TestMode testMode, - CommitStrategy commitStrategy, - HiveBucketProperty::Kind bucketKind, - bool bucketSort, - bool multiDrivers, - CompressionKind compressionKind, - bool scaleWriter) { - value = (scaleWriter ? 1ULL << 56 : 0) | - static_cast(compressionKind) << 48 | - static_cast(!!multiDrivers) << 40 | - static_cast(fileFormat) << 32 | - static_cast(testMode) << 24 | - static_cast(commitStrategy) << 16 | - static_cast(bucketKind) << 8 | !!bucketSort; -} - -CompressionKind TableWriterTestBase::TestParam::compressionKind() const { - return static_cast( - (value & ((1L << 56) - 1)) >> 48); -} - -bool TableWriterTestBase::TestParam::multiDrivers() const { - return (value >> 40) != 0; -} - -FileFormat TableWriterTestBase::TestParam::fileFormat() const { - return static_cast((value & ((1L << 40) - 1)) >> 32); -} - -TableWriterTestBase::TestMode TableWriterTestBase::TestParam::testMode() const { - return static_cast((value & ((1L << 32) - 1)) >> 24); -} - -CommitStrategy TableWriterTestBase::TestParam::commitStrategy() const { - return static_cast((value & ((1L << 24) - 1)) >> 16); -} - -HiveBucketProperty::Kind TableWriterTestBase::TestParam::bucketKind() const { - return static_cast((value & ((1L << 16) - 1)) >> 8); -} - -bool TableWriterTestBase::TestParam::bucketSort() const { - return (value & ((1L << 8) - 1)) != 0; -} - -bool TableWriterTestBase::TestParam::scaleWriter() const { - return (value >> 56) != 0; -} - -std::string TableWriterTestBase::TestParam::toString() const { - return fmt::format( - "FileFormat[{}] TestMode[{}] commitStrategy[{}] bucketKind[{}] bucketSort[{}] multiDrivers[{}] compression[{}] scaleWriter[{}]", - dwio::common::toString((fileFormat())), - testModeString(testMode()), - commitStrategyToString(commitStrategy()), - HiveBucketProperty::kindString(bucketKind()), - bucketSort(), - multiDrivers(), - compressionKindToString(compressionKind()), - scaleWriter()); -} +namespace facebook::velox::exec::test { -std::string TableWriterTestBase::testModeString(TestMode mode) { - switch (mode) { - case TestMode::kUnpartitioned: - return "UNPARTITIONED"; - case TestMode::kPartitioned: - return "PARTITIONED"; - case TestMode::kBucketed: - return "BUCKETED"; - case TestMode::kOnlyBucketed: - return "BUCKETED (NOT PARTITIONED)"; - } - VELOX_UNREACHABLE(); -} - -// static -std::shared_ptr -TableWriterTestBase::generateAggregationNode( - const std::string& name, - const std::vector& groupingKeys, - AggregationNode::Step step, - const PlanNodePtr& source) { - 
core::TypedExprPtr inputField = - std::make_shared(BIGINT(), name); - auto callExpr = std::make_shared( - BIGINT(), std::vector{inputField}, "min"); - std::vector aggregateNames = {"min"}; - std::vector aggregates = { - core::AggregationNode::Aggregate{ - callExpr, {{BIGINT()}}, nullptr, {}, {}}}; - return std::make_shared( - core::PlanNodeId(), - step, - groupingKeys, - std::vector{}, - aggregateNames, - aggregates, - false, // ignoreNullKeys - source); -} +using namespace facebook::velox::exec::test; +using namespace facebook::velox::connector::common; -// static. -std::function -TableWriterTestBase::addTableWriter( - const RowTypePtr& inputColumns, +std::shared_ptr +TableWriterTestBase::makeInsertTableHandle( const std::vector& tableColumnNames, - const std::shared_ptr& aggregationNode, - const std::shared_ptr& insertHandle, - bool hasPartitioningScheme, - connector::CommitStrategy commitStrategy) { - return [=](core::PlanNodeId nodeId, - core::PlanNodePtr source) -> core::PlanNodePtr { - return std::make_shared( - nodeId, - inputColumns, - tableColumnNames, - aggregationNode, - insertHandle, - hasPartitioningScheme, - TableWriteTraits::outputType(aggregationNode), - commitStrategy, - std::move(source)); - }; -} - -// static. -RowTypePtr TableWriterTestBase::getNonPartitionsColumns( - const std::vector& partitionedKeys, - const RowTypePtr& rowType) { - std::vector dataColumnNames; - std::vector dataColumnTypes; - for (auto i = 0; i < rowType->size(); i++) { - auto name = rowType->names()[i]; - if (std::find(partitionedKeys.cbegin(), partitionedKeys.cend(), name) == - partitionedKeys.cend()) { - dataColumnNames.emplace_back(name); - dataColumnTypes.emplace_back(rowType->findChild(name)); - } - } - return ROW(std::move(dataColumnNames), std::move(dataColumnTypes)); -} - -TableWriterTestBase::TableWriterTestBase(uint64_t testValue) - : testParam_(static_cast(testValue)), - fileFormat_(testParam_.fileFormat()), - testMode_(testParam_.testMode()), - numTableWriterCount_( - testParam_.multiDrivers() ? kNumTableWriterCount : 1), - numPartitionedTableWriterCount_( - testParam_.multiDrivers() ? kNumPartitionedTableWriterCount : 1), - commitStrategy_(testParam_.commitStrategy()), - compressionKind_(testParam_.compressionKind()), - scaleWriter_(testParam_.scaleWriter()) { - LOG(INFO) << testParam_.toString(); - auto rowType = - ROW({"c0", "c1", "c2", "c3", "c4", "c5"}, - {BIGINT(), INTEGER(), SMALLINT(), REAL(), DOUBLE(), VARCHAR()}); - setDataTypes(rowType); - if (testMode_ == TestMode::kPartitioned || testMode_ == TestMode::kBucketed) { - const std::vector partitionBy = {"c0", "c1"}; - setPartitionBy(partitionBy); - numPartitionKeyValues_ = {4, 4}; - } - if (testMode_ == TestMode::kBucketed || - testMode_ == TestMode::kOnlyBucketed) { - std::vector bucketedBy = {"c3", "c5"}; - std::vector bucketedTypes = {REAL(), VARCHAR()}; - std::vector> sortedBy; - if (testParam_.bucketSort()) { - // The sortedBy key shouldn't contain partitionBy key. - sortedBy = {std::make_shared( - "c4", core::SortOrder{true, true})}; - // The sortColumnIndices_ should represent the indices after removing - // the partition keys. 
- if (testMode_ == TestMode::kBucketed) { - sortColumnIndices_ = {2}; - } else { - sortColumnIndices_ = {4}; - } - sortedFlags_ = {{true, true}}; - } - bucketProperty_ = std::make_shared( - testParam_.bucketKind(), 4, bucketedBy, bucketedTypes, sortedBy); - } -} - -void TableWriterTestBase::SetUp() { - HiveConnectorTestBase::SetUp(); -} - -std::shared_ptr TableWriterTestBase::assertQueryWithWriterConfigs( - const core::PlanNodePtr& plan, - std::vector> filePaths, - const std::string& duckDbSql, - bool spillEnabled) { - std::vector splits; - for (const auto& filePath : filePaths) { - splits.push_back(Split(makeHiveConnectorSplit(filePath->getPath()))); - } - if (!spillEnabled) { - return AssertQueryBuilder(plan, duckDbQueryRunner_) - .maxDrivers( - 2 * std::max(kNumTableWriterCount, kNumPartitionedTableWriterCount)) - .config( - QueryConfig::kTaskWriterCount, std::to_string(numTableWriterCount_)) - .config( - QueryConfig::kTaskPartitionedWriterCount, - std::to_string(numPartitionedTableWriterCount_)) - // Scale writer settings to trigger partition rebalancing. - .config(QueryConfig::kScaleWriterRebalanceMaxMemoryUsageRatio, "1.0") - .config( - QueryConfig::kScaleWriterMinProcessedBytesRebalanceThreshold, "0") - .config( - QueryConfig:: - kScaleWriterMinPartitionProcessedBytesRebalanceThreshold, - "0") - .splits(splits) - .assertResults(duckDbSql); - } - const auto spillDirectory = TempDirectoryPath::create(); - TestScopedSpillInjection scopedSpillInjection(100); - return AssertQueryBuilder(plan, duckDbQueryRunner_) - .spillDirectory(spillDirectory->getPath()) - .maxDrivers( - 2 * std::max(kNumTableWriterCount, kNumPartitionedTableWriterCount)) - .config( - QueryConfig::kTaskWriterCount, std::to_string(numTableWriterCount_)) - .config( - QueryConfig::kTaskPartitionedWriterCount, - std::to_string(numPartitionedTableWriterCount_)) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(QueryConfig::kWriterSpillEnabled, "true") - // Scale writer settings to trigger partition rebalancing. - .config(QueryConfig::kScaleWriterRebalanceMaxMemoryUsageRatio, "1.0") - .config(QueryConfig::kScaleWriterMinProcessedBytesRebalanceThreshold, "0") - .config( - QueryConfig::kScaleWriterMinPartitionProcessedBytesRebalanceThreshold, - "0") - .splits(splits) - .assertResults(duckDbSql); -} - -std::shared_ptr TableWriterTestBase::assertQueryWithWriterConfigs( - const core::PlanNodePtr& plan, - const std::string& duckDbSql, - bool enableSpill) { - if (!enableSpill) { - TestScopedSpillInjection scopedSpillInjection(100); - return AssertQueryBuilder(plan, duckDbQueryRunner_) - .maxDrivers( - 2 * std::max(kNumTableWriterCount, kNumPartitionedTableWriterCount)) - .config( - QueryConfig::kTaskWriterCount, std::to_string(numTableWriterCount_)) - .config( - QueryConfig::kTaskPartitionedWriterCount, - std::to_string(numPartitionedTableWriterCount_)) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(QueryConfig::kWriterSpillEnabled, "true") - // Scale writer settings to trigger partition rebalancing. 
- .config(QueryConfig::kScaleWriterRebalanceMaxMemoryUsageRatio, "1.0") - .config( - QueryConfig::kScaleWriterMinProcessedBytesRebalanceThreshold, "0") - .config( - QueryConfig:: - kScaleWriterMinPartitionProcessedBytesRebalanceThreshold, - "0") - .assertResults(duckDbSql); - } - const auto spillDirectory = TempDirectoryPath::create(); - TestScopedSpillInjection scopedSpillInjection(100); - return AssertQueryBuilder(plan, duckDbQueryRunner_) - .spillDirectory(spillDirectory->getPath()) - .maxDrivers( - 2 * std::max(kNumTableWriterCount, kNumPartitionedTableWriterCount)) - .config( - QueryConfig::kTaskWriterCount, std::to_string(numTableWriterCount_)) - .config( - QueryConfig::kTaskPartitionedWriterCount, - std::to_string(numPartitionedTableWriterCount_)) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(QueryConfig::kWriterSpillEnabled, "true") - // Scale writer settings to trigger partition rebalancing. - .config(QueryConfig::kScaleWriterRebalanceMaxMemoryUsageRatio, "1.0") - .config(QueryConfig::kScaleWriterMinProcessedBytesRebalanceThreshold, "0") - .config( - QueryConfig::kScaleWriterMinPartitionProcessedBytesRebalanceThreshold, - "0") - .assertResults(duckDbSql); -} - -RowVectorPtr TableWriterTestBase::runQueryWithWriterConfigs( - const core::PlanNodePtr& plan, - bool spillEnabled) { - if (!spillEnabled) { - return AssertQueryBuilder(plan, duckDbQueryRunner_) - .maxDrivers( - 2 * std::max(kNumTableWriterCount, kNumPartitionedTableWriterCount)) - .config( - QueryConfig::kTaskWriterCount, std::to_string(numTableWriterCount_)) - .config( - QueryConfig::kTaskPartitionedWriterCount, - std::to_string(numPartitionedTableWriterCount_)) - // Scale writer settings to trigger partition rebalancing. - .config(QueryConfig::kScaleWriterRebalanceMaxMemoryUsageRatio, "1.0") - .config( - QueryConfig::kScaleWriterMinProcessedBytesRebalanceThreshold, "0") - .config( - QueryConfig:: - kScaleWriterMinPartitionProcessedBytesRebalanceThreshold, - "0") - .copyResults(pool()); - } - const auto spillDirectory = TempDirectoryPath::create(); - TestScopedSpillInjection scopedSpillInjection(100); - return AssertQueryBuilder(plan, duckDbQueryRunner_) - .spillDirectory(spillDirectory->getPath()) - .maxDrivers( - 2 * std::max(kNumTableWriterCount, kNumPartitionedTableWriterCount)) - .config( - QueryConfig::kTaskWriterCount, std::to_string(numTableWriterCount_)) - .config( - QueryConfig::kTaskPartitionedWriterCount, - std::to_string(numPartitionedTableWriterCount_)) - .config(core::QueryConfig::kSpillEnabled, "true") - .config(QueryConfig::kWriterSpillEnabled, "true") - // Scale writer settings to trigger partition rebalancing. 
- .config(QueryConfig::kScaleWriterRebalanceMaxMemoryUsageRatio, "1.0") - .config(QueryConfig::kScaleWriterMinProcessedBytesRebalanceThreshold, "0") - .config( - QueryConfig::kScaleWriterMinPartitionProcessedBytesRebalanceThreshold, - "0") - .copyResults(pool()); -} - -void TableWriterTestBase::setCommitStrategy(CommitStrategy commitStrategy) { - commitStrategy_ = commitStrategy; -} - -void TableWriterTestBase::setPartitionBy( - const std::vector& partitionBy) { - partitionedBy_ = partitionBy; - for (const auto& partitionColumn : partitionedBy_) { - for (int i = 0; i < rowType_->size(); ++i) { - if (rowType_->nameOf(i) == partitionColumn) { - partitionChannels_.emplace_back(i); - partitionTypes_.emplace_back(rowType_->childAt(i)); - } - } - } -} - -void TableWriterTestBase::setBucketProperty( - HiveBucketProperty::Kind kind, - uint32_t bucketCount, - const std::vector& bucketedBy, - const std::vector& bucketedTypes, - const std::vector>& sortedBy) { - bucketProperty_ = std::make_shared( - kind, bucketCount, bucketedBy, bucketedTypes, sortedBy); -} - -void TableWriterTestBase::setDataTypes( - const RowTypePtr& inputType, - const RowTypePtr& tableSchema) { - rowType_ = inputType; - if (tableSchema != nullptr) { - setTableSchema(tableSchema); - } else { - setTableSchema(rowType_); - } -} - -void TableWriterTestBase::setTableSchema(const RowTypePtr& tableSchema) { - tableSchema_ = tableSchema; -} - -std::vector> -TableWriterTestBase::makeHiveConnectorSplits( - const std::shared_ptr& directoryPath) { - return makeHiveConnectorSplits(directoryPath->getPath()); -} - -std::vector> -TableWriterTestBase::makeHiveConnectorSplits(const std::string& directoryPath) { - std::vector> splits; - for (auto& path : fs::recursive_directory_iterator(directoryPath)) { - if (path.is_regular_file()) { - splits.push_back(HiveConnectorTestBase::makeHiveConnectorSplits( - path.path().string(), 1, fileFormat_)[0]); - } - } - return splits; -} - -// Lists and returns all the regular files from a given directory -// recursively. -std::vector TableWriterTestBase::listAllFiles( - const std::string& directoryPath) { - std::vector files; - for (auto& path : fs::recursive_directory_iterator(directoryPath)) { - if (path.is_regular_file()) { - files.push_back(path.path().filename()); - } - } - return files; -} - -// Builds and returns the hive splits from the list of files with one split -// per each file. -std::vector> -TableWriterTestBase::makeHiveConnectorSplits( - const std::vector& filePaths) { - std::vector> splits; - for (const auto& filePath : filePaths) { - splits.push_back(HiveConnectorTestBase::makeHiveConnectorSplits( - filePath.string(), 1, fileFormat_)[0]); - } - return splits; -} - -std::vector TableWriterTestBase::makeVectors( - int32_t numVectors, - int32_t rowsPerVector) { - auto rowVectors = - HiveConnectorTestBase::makeVectors(rowType_, numVectors, rowsPerVector); - if (testMode_ == TestMode::kUnpartitioned || - testMode_ == TestMode::kOnlyBucketed) { - return rowVectors; - } - // In case of partitioned table write test case, we ensure the number of - // unique partition key values are capped. 
- for (auto& rowVector : rowVectors) { - auto c0PartitionVector = - makeFlatVector(rowsPerVector, [&](auto /*unused*/) { - return folly::Random().rand32() % numPartitionKeyValues_[0]; - }); - auto c1PartitionVector = - makeFlatVector(rowsPerVector, [&](auto /*unused*/) { - return folly::Random().rand32() % numPartitionKeyValues_[1]; - }); - rowVector->childAt(0) = c0PartitionVector; - rowVector->childAt(1) = c1PartitionVector; - } - return rowVectors; -} - -RowVectorPtr TableWriterTestBase::makeConstantVector(size_t size) { - return makeRowVector( - rowType_->names(), - {makeConstant((int64_t)123'456, size), - makeConstant((int32_t)321, size), - makeConstant((int16_t)12'345, size), - makeConstant(variant(TypeKind::REAL), size), - makeConstant((double)1'234.01, size), - makeConstant(variant(TypeKind::VARCHAR), size)}); -} - -std::vector TableWriterTestBase::makeBatches( - vector_size_t numBatches, - std::function makeVector) { - std::vector batches; - batches.reserve(numBatches); - for (int32_t i = 0; i < numBatches; ++i) { - batches.push_back(makeVector(i)); - } - return batches; -} - -std::set TableWriterTestBase::getLeafSubdirectories( - const std::string& directoryPath) { - std::set subdirectories; - for (auto& path : fs::recursive_directory_iterator(directoryPath)) { - if (path.is_regular_file()) { - subdirectories.emplace(path.path().parent_path().string()); - } - } - return subdirectories; -} - -std::vector TableWriterTestBase::getRecursiveFiles( - const std::string& directoryPath) { - std::vector files; - for (auto& path : fs::recursive_directory_iterator(directoryPath)) { - if (path.is_regular_file()) { - files.push_back(path.path().string()); - } - } - return files; -} - -uint32_t TableWriterTestBase::countRecursiveFiles( - const std::string& directoryPath) { - return getRecursiveFiles(directoryPath).size(); -} - -// Helper method to return InsertTableHandle. -std::shared_ptr -TableWriterTestBase::createInsertTableHandle( - const RowTypePtr& outputRowType, - const connector::hive::LocationHandle::TableType& outputTableType, - const std::string& outputDirectoryPath, - const std::vector& partitionedBy, - const std::shared_ptr bucketProperty, - const std::optional compressionKind) { - return std::make_shared( - kHiveConnectorId, - makeHiveInsertTableHandle( - outputRowType->names(), - outputRowType->children(), - partitionedBy, - bucketProperty, - makeLocationHandle( - outputDirectoryPath, std::nullopt, outputTableType), - fileFormat_, - compressionKind)); -} - -// Returns a table insert plan node. 
-PlanNodePtr TableWriterTestBase::createInsertPlan( - PlanBuilder& inputPlan, - const RowTypePtr& outputRowType, - const std::string& outputDirectoryPath, - const std::vector& partitionedBy, - std::shared_ptr bucketProperty, - const std::optional compressionKind, - int numTableWriters, - const connector::hive::LocationHandle::TableType& outputTableType, - const CommitStrategy& outputCommitStrategy, - bool aggregateResult, - std::shared_ptr aggregationNode) { - return createInsertPlan( - inputPlan, - inputPlan.planNode()->outputType(), - outputRowType, - outputDirectoryPath, - partitionedBy, - std::move(bucketProperty), - compressionKind, - numTableWriters, - outputTableType, - outputCommitStrategy, - aggregateResult, - aggregationNode); -} - -PlanNodePtr TableWriterTestBase::createInsertPlan( - PlanBuilder& inputPlan, - const RowTypePtr& inputRowType, - const RowTypePtr& tableRowType, - const std::string& outputDirectoryPath, + const std::vector& tableColumnTypes, const std::vector& partitionedBy, - std::shared_ptr bucketProperty, - const std::optional compressionKind, - int numTableWriters, - const connector::hive::LocationHandle::TableType& outputTableType, - const CommitStrategy& outputCommitStrategy, - bool aggregateResult, - std::shared_ptr aggregationNode) { - if (numTableWriters == 1) { - return createInsertPlanWithSingleWriter( - inputPlan, - inputRowType, - tableRowType, - outputDirectoryPath, - partitionedBy, - bucketProperty, - compressionKind, - outputTableType, - outputCommitStrategy, - aggregateResult, - aggregationNode); - } else if (bucketProperty_ == nullptr) { - return createInsertPlanWithForNonBucketedTable( - inputPlan, - inputRowType, - tableRowType, - outputDirectoryPath, - partitionedBy, - compressionKind, - outputTableType, - outputCommitStrategy, - aggregateResult, - aggregationNode); - } else { - return createInsertPlanForBucketTable( - inputPlan, - inputRowType, - tableRowType, - outputDirectoryPath, - partitionedBy, - bucketProperty, - compressionKind, - outputTableType, - outputCommitStrategy, - aggregateResult, - aggregationNode); - } -} + std::shared_ptr locationHandle, + dwio::common::FileFormat tableStorageFormat, + const std::optional compressionKind, + const std::unordered_map& serdeParameters, + const std::shared_ptr& writerOptions, + bool ensureFiles, + const folly::dynamic& bucketPropertyOptions) { + folly::dynamic options = folly::dynamic::object( + "partitionedBy", folly::dynamic::array())( + "serdeParameters", folly::dynamic::object())( + "fileFormat", static_cast(tableStorageFormat))( + "ensureFiles", ensureFiles); -PlanNodePtr TableWriterTestBase::createInsertPlanWithSingleWriter( - PlanBuilder& inputPlan, - const RowTypePtr& inputRowType, - const RowTypePtr& tableRowType, - const std::string& outputDirectoryPath, - const std::vector& partitionedBy, - std::shared_ptr bucketProperty, - const std::optional compressionKind, - const connector::hive::LocationHandle::TableType& outputTableType, - const CommitStrategy& outputCommitStrategy, - bool aggregateResult, - std::shared_ptr aggregationNode) { - const bool addScaleWriterExchange = - scaleWriter_ && (bucketProperty != nullptr); - auto insertPlan = inputPlan; - if (addScaleWriterExchange) { - if (!partitionedBy.empty()) { - insertPlan.scaleWriterlocalPartition( - inputColumnNames(partitionedBy, tableRowType, inputRowType)); - } else { - insertPlan.scaleWriterlocalPartitionRoundRobin(); - } + for (auto& col : partitionedBy) { + options["partitionedBy"].push_back(col); } - insertPlan - 
.addNode(addTableWriter( - inputRowType, - tableRowType->names(), - aggregationNode, - createInsertTableHandle( - tableRowType, - outputTableType, - outputDirectoryPath, - partitionedBy, - bucketProperty, - compressionKind), - false, - outputCommitStrategy)) - .capturePlanNodeId(tableWriteNodeId_); - if (addScaleWriterExchange) { - if (!partitionedBy.empty()) { - insertPlan.scaleWriterlocalPartition( - inputColumnNames(partitionedBy, tableRowType, inputRowType)); - } else { - insertPlan.scaleWriterlocalPartitionRoundRobin(); - } - } - if (aggregateResult) { - insertPlan.project({TableWriteTraits::rowCountColumnName()}) - .singleAggregation( - {}, - {fmt::format("sum({})", TableWriteTraits::rowCountColumnName())}); - } - return insertPlan.planNode(); -} -PlanNodePtr TableWriterTestBase::createInsertPlanForBucketTable( - PlanBuilder& inputPlan, - const RowTypePtr& inputRowType, - const RowTypePtr& tableRowType, - const std::string& outputDirectoryPath, - const std::vector& partitionedBy, - std::shared_ptr bucketProperty, - const std::optional compressionKind, - const connector::hive::LocationHandle::TableType& outputTableType, - const CommitStrategy& outputCommitStrategy, - bool aggregateResult, - std::shared_ptr aggregationNode) { - // Since we might do column rename, so generate bucket property based on - // the data type from 'inputPlan'. - std::vector bucketColumns; - bucketColumns.reserve(bucketProperty->bucketedBy().size()); - for (int i = 0; i < bucketProperty->bucketedBy().size(); ++i) { - bucketColumns.push_back(inputRowType->names()[tableRowType->getChildIdx( - bucketProperty->bucketedBy()[i])]); + for (auto& kv : serdeParameters) { + options["serdeParameters"][kv.first] = kv.second; } - auto localPartitionBucketProperty = std::make_shared( - bucketProperty->kind(), - bucketProperty->bucketCount(), - bucketColumns, - bucketProperty->bucketedTypes(), - bucketProperty->sortedBy()); - auto insertPlan = - inputPlan.localPartitionByBucket(localPartitionBucketProperty) - .addNode(addTableWriter( - inputRowType, - tableRowType->names(), - nullptr, - createInsertTableHandle( - tableRowType, - outputTableType, - outputDirectoryPath, - partitionedBy, - bucketProperty, - compressionKind), - false, - outputCommitStrategy)) - .capturePlanNodeId(tableWriteNodeId_) - .localPartition({}) - .tableWriteMerge(); - if (aggregateResult) { - insertPlan.project({TableWriteTraits::rowCountColumnName()}) - .singleAggregation( - {}, - {fmt::format("sum({})", TableWriteTraits::rowCountColumnName())}); - } - return insertPlan.planNode(); -} -// static -std::vector TableWriterTestBase::inputColumnNames( - const std::vector& tableColumnNames, - const RowTypePtr& tableRowType, - const RowTypePtr& inputRowType) { - std::vector inputNames; - inputNames.reserve(tableColumnNames.size()); - for (const auto& tableColumnName : tableColumnNames) { - const auto columnIdx = tableRowType->getChildIdx(tableColumnName); - inputNames.push_back(inputRowType->nameOf(columnIdx)); + if (writerOptions) { + options["writerOptions"] = writerOptions; } - return inputNames; -} -PlanNodePtr TableWriterTestBase::createInsertPlanWithForNonBucketedTable( - PlanBuilder& inputPlan, - const RowTypePtr& inputRowType, - const RowTypePtr& tableRowType, - const std::string& outputDirectoryPath, - const std::vector& partitionedBy, - const std::optional compressionKind, - const connector::hive::LocationHandle::TableType& outputTableType, - const CommitStrategy& outputCommitStrategy, - bool aggregateResult, - std::shared_ptr aggregationNode) { - 
auto insertPlan = inputPlan; - if (scaleWriter_) { - if (!partitionedBy.empty()) { - insertPlan.scaleWriterlocalPartition( - inputColumnNames(partitionedBy, tableRowType, inputRowType)); - } else { - insertPlan.scaleWriterlocalPartitionRoundRobin(); - } + if (!bucketPropertyOptions.isNull()) { + options["bucketProperty"] = bucketPropertyOptions; } - insertPlan - .addNode(addTableWriter( - inputRowType, - tableRowType->names(), - nullptr, - createInsertTableHandle( - tableRowType, - outputTableType, - outputDirectoryPath, - partitionedBy, - nullptr, - compressionKind), - false, - outputCommitStrategy)) - .capturePlanNodeId(tableWriteNodeId_) - .localPartition(std::vector{}) - .tableWriteMerge(); - if (aggregateResult) { - insertPlan.project({TableWriteTraits::rowCountColumnName()}) - .singleAggregation( - {}, - {fmt::format("sum({})", TableWriteTraits::rowCountColumnName())}); - } - return insertPlan.planNode(); -} -std::string TableWriterTestBase::partitionNameToPredicate( - const std::string& partitionName, - const std::vector& partitionTypes) { - std::vector conjuncts; - std::vector partitionKeyValues; - folly::split('/', partitionName, partitionKeyValues); - VELOX_CHECK_EQ(partitionKeyValues.size(), partitionTypes.size()); - for (auto i = 0; i < partitionKeyValues.size(); ++i) { - if (partitionTypes[i]->isVarchar() || partitionTypes[i]->isVarbinary() || - partitionTypes[i]->isDate()) { - conjuncts.push_back(partitionKeyValues[i] - .replace(partitionKeyValues[i].find("="), 1, "='") - .append("'")); - } else { - conjuncts.push_back(partitionKeyValues[i]); - } - } - return folly::join(" AND ", conjuncts); -} - -std::string TableWriterTestBase::partitionNameToPredicate( - const std::vector& partitionDirNames) { - std::vector conjuncts; - VELOX_CHECK_EQ(partitionDirNames.size(), partitionTypes_.size()); - std::vector partitionKeyValues = partitionDirNames; - for (auto i = 0; i < partitionDirNames.size(); ++i) { - if (partitionTypes_[i]->isVarchar() || partitionTypes_[i]->isVarbinary() || - partitionTypes_[i]->isDate()) { - conjuncts.push_back(partitionKeyValues[i] - .replace(partitionKeyValues[i].find("="), 1, "='") - .append("'")); - } else { - conjuncts.push_back(partitionDirNames[i]); - } - } - return folly::join(" AND ", conjuncts); -} - -void TableWriterTestBase::verifyUnbucketedFilePath( - const std::filesystem::path& filePath, - const std::string& targetDir) { - ASSERT_EQ(filePath.parent_path().string(), targetDir); - if (commitStrategy_ == CommitStrategy::kNoCommit) { - ASSERT_TRUE(RE2::FullMatch( - filePath.filename().string(), - fmt::format( - "test_cursor.+_[0-{}]_{}_.+", - numTableWriterCount_ - 1, - tableWriteNodeId_))) - << filePath.filename().string(); - } else { - ASSERT_TRUE(RE2::FullMatch( - filePath.filename().string(), - fmt::format( - ".tmp.velox.test_cursor.+_[0-{}]_{}_.+", - numTableWriterCount_ - 1, - tableWriteNodeId_))) - << filePath.filename().string(); - } -} - -void TableWriterTestBase::verifyPartitionedFilePath( - const std::filesystem::path& filePath, - const std::string& targetDir) { - verifyPartitionedDirPath(filePath.parent_path(), targetDir); - verifyUnbucketedFilePath(filePath, filePath.parent_path().string()); -} - -void TableWriterTestBase::verifyBucketedFileName( - const std::filesystem::path& filePath) { - if (commitStrategy_ == CommitStrategy::kNoCommit) { - if (fileFormat_ == FileFormat::PARQUET) { - ASSERT_TRUE(RE2::FullMatch( - filePath.filename().string(), - "0[0-9]+_0_TaskCursorQuery_[0-9]+\\.parquet$")) - << filePath.filename().string(); - } 
else { - ASSERT_TRUE(RE2::FullMatch( - filePath.filename().string(), "0[0-9]+_0_TaskCursorQuery_[0-9]+")) - << filePath.filename().string(); - } - } else { - if (fileFormat_ == FileFormat::PARQUET) { - ASSERT_TRUE(RE2::FullMatch( - filePath.filename().string(), - ".tmp.velox.0[0-9]+_0_TaskCursorQuery_[0-9]+_.+\\.parquet$")) - << filePath.filename().string(); - } else { - ASSERT_TRUE(RE2::FullMatch( - filePath.filename().string(), - ".tmp.velox.0[0-9]+_0_TaskCursorQuery_[0-9]+_.+")) - << filePath.filename().string(); - } - } -} - -void TableWriterTestBase::verifyBucketedFilePath( - const std::filesystem::path& filePath, - const std::string& targetDir) { - verifyPartitionedDirPath(filePath, targetDir); - verifyBucketedFileName(filePath); -} - -void TableWriterTestBase::verifyPartitionedDirPath( - const std::filesystem::path& dirPath, - const std::string& targetDir) { - std::string regex(targetDir); - bool matched{false}; - for (int i = 0; i < partitionedBy_.size(); ++i) { - regex = fmt::format("{}/{}=.+", regex, partitionedBy_[i]); - if (RE2::FullMatch(dirPath.string(), regex)) { - matched = true; - break; - } - } - ASSERT_TRUE(matched) << dirPath; -} - -uint32_t TableWriterTestBase::parseBucketId(const std::string& bucketFileName) { - uint32_t bucketId; - if (commitStrategy_ == CommitStrategy::kNoCommit) { - VELOX_CHECK(RE2::FullMatch(bucketFileName, "(\\d+)_.+", &bucketId)); - } else { - VELOX_CHECK( - RE2::FullMatch(bucketFileName, ".tmp.velox.(\\d+)_.+", &bucketId)); - } - return bucketId; -} - -// Returns the list of partition directory names in the given directory -// path. -std::vector TableWriterTestBase::getPartitionDirNames( - const std::filesystem::path& dirPath) { - std::vector dirNames; - auto nextPath = dirPath; - for (int i = 0; i < partitionedBy_.size(); ++i) { - dirNames.push_back(nextPath.filename().string()); - nextPath = nextPath.parent_path(); - } - return dirNames; -} - -void TableWriterTestBase::verifyPartitionedFilesData( - const std::vector& filePaths, - const std::filesystem::path& dirPath) { - HiveConnectorTestBase::assertQuery( - PlanBuilder().tableScan(rowType_).planNode(), - {makeHiveConnectorSplits(filePaths)}, - fmt::format( - "SELECT c2, c3, c4, c5 FROM tmp WHERE {}", - partitionNameToPredicate(getPartitionDirNames(dirPath)))); + // Delegate to the common connector factory instead of Hive + auto& factory = connector::common::ConnectorObjectFactoryRegistry::instance() + .factoryFor(kHiveConnectorName); + return factory.makeInsertTableHandle( + tableColumnNames, + tableColumnTypes, + std::move(locationHandle), + compressionKind, + options); } -std::unique_ptr TableWriterTestBase::getBucketFunction( - const RowTypePtr& outputType) { - const auto& bucketedBy = bucketProperty_->bucketedBy(); - std::vector bucketedByChannels; - bucketedByChannels.reserve(bucketedBy.size()); - for (auto i = 0; i < bucketedBy.size(); ++i) { - const auto& bucketColumn = bucketedBy[i]; - for (column_index_t columnChannel = 0; columnChannel < outputType->size(); - ++columnChannel) { - if (outputType->nameOf(columnChannel) == bucketColumn) { - bucketedByChannels.push_back(columnChannel); - break; - } - } - } - VELOX_USER_CHECK_EQ(bucketedByChannels.size(), bucketedBy.size()); - return std::make_unique( - bucketProperty_->bucketCount(), bucketedByChannels); -} +folly::dynamic TableWriterTestBase::makeBucketPropertyOptions( + int32_t bucketCount, + const std::vector& bucketedBy, + const std::vector& bucketedTypes, + const std::vector>& sortedBy) { + // Start with required fields + 
folly::dynamic bp = folly::dynamic::object( + "bucketCount", bucketCount)( + "bucketedBy", folly::dynamic::array()); -void TableWriterTestBase::verifyBucketedFileData( - const std::filesystem::path& filePath, - const RowTypePtr& outputFileType) { - const std::vector filePaths = {filePath}; - // Read data from bucketed file on disk into 'rowVector'. - core::PlanNodeId scanNodeId; - auto plan = PlanBuilder() - .tableScan(outputFileType, {}, "", outputFileType) - .capturePlanNodeId(scanNodeId) - .planNode(); - const auto resultVector = - AssertQueryBuilder(plan) - .splits(scanNodeId, makeHiveConnectorSplits(filePaths)) - .copyResults(pool_.get()); - // Parse the bucket id encoded in bucketed file name. - const uint32_t expectedBucketId = parseBucketId(filePath.filename().string()); - // Compute the bucket id from read result by applying hash partition on - // bucketed columns in read result, and we expect they all match the one - // encoded in file name. - auto bucketFunction = getBucketFunction(outputFileType); - std::vector bucketIds; - bucketIds.reserve(resultVector->size()); - bucketFunction->partition(*resultVector, bucketIds); - for (const auto bucketId : bucketIds) { - ASSERT_EQ(expectedBucketId, bucketId); + // bucketedBy + for (auto& col : bucketedBy) { + bp["bucketedBy"].push_back(col); } - if (!testParam_.bucketSort()) { - return; + // bucketedTypes + bp["bucketedTypes"] = folly::dynamic::array(); + for (auto& typeStr : bucketedTypes) { + bp["bucketedTypes"].push_back(typeStr); } - // Verifies the sorting behavior - for (int i = 0; i < resultVector->size() - 1; ++i) { - for (int j = 0; j < sortColumnIndices_.size(); ++j) { - auto compareResult = - resultVector->childAt(sortColumnIndices_.at(j)) - ->compare( - resultVector->childAt(sortColumnIndices_.at(j)) - ->wrappedVector(), - i, - i + 1, - sortedFlags_[j]); - if (compareResult.has_value()) { - if (compareResult.value() < 0) { - break; - } - ASSERT_EQ(compareResult.value(), 0); - } - } + // sortedBy + bp["sortedBy"] = folly::dynamic::array(); + for (auto& entry : sortedBy) { + folly::dynamic e = folly::dynamic::object + ("column", entry.first) + ("order", entry.second); + bp["sortedBy"].push_back(std::move(e)); } + return bp; } -void TableWriterTestBase::verifyTableWriterOutput( - const std::string& targetDir, - const RowTypePtr& bucketCheckFileType, - bool verifyPartitionedData, - bool verifyBucketedData) { - SCOPED_TRACE(testParam_.toString()); - std::vector filePaths; - std::vector dirPaths; - for (auto& path : fs::recursive_directory_iterator(targetDir)) { - if (path.is_regular_file()) { - filePaths.push_back(path.path()); - } else { - dirPaths.push_back(path.path()); - } - } - if (testMode_ == TestMode::kUnpartitioned) { - ASSERT_EQ(dirPaths.size(), 0); - ASSERT_LE(filePaths.size(), numTableWriterCount_); - verifyUnbucketedFilePath(filePaths[0], targetDir); - return; - } else if (testMode_ == TestMode::kOnlyBucketed) { - ASSERT_EQ(dirPaths.size(), 0); - for (const auto& filePath : filePaths) { - ASSERT_EQ(filePath.parent_path().string(), targetDir); - verifyBucketedFileName(filePath); - if (verifyBucketedData) { - verifyBucketedFileData(filePath, bucketCheckFileType); - } - } - return; - } - // Validation for both partitioned with and without buckets. 
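// Standalone illustration of the JSON shape produced by
// makeBucketPropertyOptions above. buildBucketProperty below is a local
// stand-in that mirrors the keys used in the patch ("bucketCount",
// "bucketedBy", "bucketedTypes", "sortedBy" with {"column","order"} entries).
// The column names c3/c5/c4 and the REAL/VARCHAR type strings follow the
// bucketed-table setup in this test base; the "ASC_NULLS_FIRST" order string
// is only an assumed textual encoding of core::SortOrder{true, true}.
#include <folly/dynamic.h>
#include <folly/json.h>

#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

folly::dynamic buildBucketProperty(
    int32_t bucketCount,
    const std::vector<std::string>& bucketedBy,
    const std::vector<std::string>& bucketedTypes,
    const std::vector<std::pair<std::string, std::string>>& sortedBy) {
  folly::dynamic bp = folly::dynamic::object("bucketCount", bucketCount)(
      "bucketedBy", folly::dynamic::array());
  for (const auto& column : bucketedBy) {
    bp["bucketedBy"].push_back(column);
  }
  bp["bucketedTypes"] = folly::dynamic::array();
  for (const auto& type : bucketedTypes) {
    bp["bucketedTypes"].push_back(type);
  }
  bp["sortedBy"] = folly::dynamic::array();
  for (const auto& [column, order] : sortedBy) {
    folly::dynamic entry =
        folly::dynamic::object("column", column)("order", order);
    bp["sortedBy"].push_back(std::move(entry));
  }
  return bp;
}

int main() {
  const auto bp = buildBucketProperty(
      4, {"c3", "c5"}, {"REAL", "VARCHAR"}, {{"c4", "ASC_NULLS_FIRST"}});
  // Expected shape:
  // {"bucketCount":4,"bucketedBy":["c3","c5"],
  //  "bucketedTypes":["REAL","VARCHAR"],
  //  "sortedBy":[{"column":"c4","order":"ASC_NULLS_FIRST"}]}
  std::cout << folly::toJson(bp) << std::endl;
  return 0;
}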
- ASSERT_EQ(numPartitionKeyValues_.size(), 2); - const auto totalPartitions = - numPartitionKeyValues_[0] * numPartitionKeyValues_[1]; - ASSERT_LE(dirPaths.size(), totalPartitions + numPartitionKeyValues_[0]); - int32_t numLeafDir{0}; - for (const auto& dirPath : dirPaths) { - verifyPartitionedDirPath(dirPath, targetDir); - if (dirPath.parent_path().string() != targetDir) { - ++numLeafDir; - } - } - if (testMode_ == TestMode::kPartitioned) { - // We expect only one file under each directory without dynamic writer - // support. - ASSERT_GE(numLeafDir * numTableWriterCount_, filePaths.size()); - for (const auto& filePath : filePaths) { - verifyPartitionedFilePath(filePath, targetDir); - if (verifyPartitionedData) { - verifyPartitionedFilesData({filePath}, filePath.parent_path()); - } - } - return; - } - ASSERT_GE(numLeafDir * bucketProperty_->bucketCount(), filePaths.size()); - std::unordered_map> - bucketFilesPerPartition; - for (const auto& filePath : filePaths) { - bucketFilesPerPartition[filePath.parent_path().string()].push_back( - filePath); - verifyBucketedFilePath(filePath, targetDir); - if (verifyBucketedData) { - verifyBucketedFileData(filePath, bucketCheckFileType); - } - } - if (verifyPartitionedData) { - for (const auto& entry : bucketFilesPerPartition) { - verifyPartitionedFilesData(entry.second, entry.second[0].parent_path()); - } - } -} +} // namespace facebook::velox::exec::test -int TableWriterTestBase::getNumWriters() { - return bucketProperty_ != nullptr ? numPartitionedTableWriterCount_ - : numTableWriterCount_; -} -} // namespace velox::exec::test +// +//#include "velox/exec/tests/utils/TableWriterTestBase.h" +// +//namespace velox::exec::test { +// +//TableWriterTestBase::TestParam::TestParam( +// FileFormat fileFormat, +// TestMode testMode, +// connector::common::CommitStrategy commitStrategy, +// HiveBucketProperty::Kind bucketKind, +// bool bucketSort, +// bool multiDrivers, +// CompressionKind compressionKind, +// bool scaleWriter) { +// value = (scaleWriter ? 
1ULL << 56 : 0) | +// static_cast(compressionKind) << 48 | +// static_cast(!!multiDrivers) << 40 | +// static_cast(fileFormat) << 32 | +// static_cast(testMode) << 24 | +// static_cast(commitStrategy) << 16 | +// static_cast(bucketKind) << 8 | !!bucketSort; +//} +// +//CompressionKind TableWriterTestBase::TestParam::compressionKind() const { +// return static_cast( +// (value & ((1L << 56) - 1)) >> 48); +//} +// +//bool TableWriterTestBase::TestParam::multiDrivers() const { +// return (value >> 40) != 0; +//} +// +//FileFormat TableWriterTestBase::TestParam::fileFormat() const { +// return static_cast((value & ((1L << 40) - 1)) >> 32); +//} +// +//TableWriterTestBase::TestMode TableWriterTestBase::TestParam::testMode() const { +// return static_cast((value & ((1L << 32) - 1)) >> 24); +//} +// +//CommitStrategy TableWriterTestBase::TestParam::commitStrategy() const { +// return static_cast((value & ((1L << 24) - 1)) >> 16); +//} +// +//HiveBucketProperty::Kind TableWriterTestBase::TestParam::bucketKind() const { +// return static_cast((value & ((1L << 16) - 1)) >> 8); +//} +// +//bool TableWriterTestBase::TestParam::bucketSort() const { +// return (value & ((1L << 8) - 1)) != 0; +//} +// +//bool TableWriterTestBase::TestParam::scaleWriter() const { +// return (value >> 56) != 0; +//} +// +//std::string TableWriterTestBase::TestParam::toString() const { +// return fmt::format( +// "FileFormat[{}] TestMode[{}] commitStrategy[{}] bucketKind[{}] bucketSort[{}] multiDrivers[{}] compression[{}] scaleWriter[{}]", +// dwio::common::toString((fileFormat())), +// testModeString(testMode()), +// commitStrategyToString(commitStrategy()), +// HiveBucketProperty::kindString(bucketKind()), +// bucketSort(), +// multiDrivers(), +// compressionKindToString(compressionKind()), +// scaleWriter()); +//} +// +//std::string TableWriterTestBase::testModeString(TestMode mode) { +// switch (mode) { +// case TestMode::kUnpartitioned: +// return "UNPARTITIONED"; +// case TestMode::kPartitioned: +// return "PARTITIONED"; +// case TestMode::kBucketed: +// return "BUCKETED"; +// case TestMode::kOnlyBucketed: +// return "BUCKETED (NOT PARTITIONED)"; +// } +// VELOX_UNREACHABLE(); +//} +// +//// static +//std::shared_ptr +//TableWriterTestBase::generateAggregationNode( +// const std::string& name, +// const std::vector& groupingKeys, +// AggregationNode::Step step, +// const PlanNodePtr& source) { +// core::TypedExprPtr inputField = +// std::make_shared(BIGINT(), name); +// auto callExpr = std::make_shared( +// BIGINT(), std::vector{inputField}, "min"); +// std::vector aggregateNames = {"min"}; +// std::vector aggregates = { +// core::AggregationNode::Aggregate{ +// callExpr, {{BIGINT()}}, nullptr, {}, {}}}; +// return std::make_shared( +// core::PlanNodeId(), +// step, +// groupingKeys, +// std::vector{}, +// aggregateNames, +// aggregates, +// false, // ignoreNullKeys +// source); +//} +// +//// static. 
+//std::function +//TableWriterTestBase::addTableWriter( +// const RowTypePtr& inputColumns, +// const std::vector& tableColumnNames, +// const std::shared_ptr& aggregationNode, +// const std::shared_ptr& insertHandle, +// bool hasPartitioningScheme, +// connector::common::CommitStrategy commitStrategy) { +// return [=](core::PlanNodeId nodeId, +// core::PlanNodePtr source) -> core::PlanNodePtr { +// return std::make_shared( +// nodeId, +// inputColumns, +// tableColumnNames, +// aggregationNode, +// insertHandle, +// hasPartitioningScheme, +// TableWriteTraits::outputType(aggregationNode), +// commitStrategy, +// std::move(source)); +// }; +//} +// +//// static. +//RowTypePtr TableWriterTestBase::getNonPartitionsColumns( +// const std::vector& partitionedKeys, +// const RowTypePtr& rowType) { +// std::vector dataColumnNames; +// std::vector dataColumnTypes; +// for (auto i = 0; i < rowType->size(); i++) { +// auto name = rowType->names()[i]; +// if (std::find(partitionedKeys.cbegin(), partitionedKeys.cend(), name) == +// partitionedKeys.cend()) { +// dataColumnNames.emplace_back(name); +// dataColumnTypes.emplace_back(rowType->findChild(name)); +// } +// } +// return ROW(std::move(dataColumnNames), std::move(dataColumnTypes)); +//} +// +//TableWriterTestBase::TableWriterTestBase(uint64_t testValue) +// : testParam_(static_cast(testValue)), +// fileFormat_(testParam_.fileFormat()), +// testMode_(testParam_.testMode()), +// numTableWriterCount_( +// testParam_.multiDrivers() ? kNumTableWriterCount : 1), +// numPartitionedTableWriterCount_( +// testParam_.multiDrivers() ? kNumPartitionedTableWriterCount : 1), +// commitStrategy_(testParam_.commitStrategy()), +// compressionKind_(testParam_.compressionKind()), +// scaleWriter_(testParam_.scaleWriter()) { +// LOG(INFO) << testParam_.toString(); +// auto rowType = +// ROW({"c0", "c1", "c2", "c3", "c4", "c5"}, +// {BIGINT(), INTEGER(), SMALLINT(), REAL(), DOUBLE(), VARCHAR()}); +// setDataTypes(rowType); +// if (testMode_ == TestMode::kPartitioned || testMode_ == TestMode::kBucketed) { +// const std::vector partitionBy = {"c0", "c1"}; +// setPartitionBy(partitionBy); +// numPartitionKeyValues_ = {4, 4}; +// } +// if (testMode_ == TestMode::kBucketed || +// testMode_ == TestMode::kOnlyBucketed) { +// std::vector bucketedBy = {"c3", "c5"}; +// std::vector bucketedTypes = {REAL(), VARCHAR()}; +// std::vector> sortedBy; +// if (testParam_.bucketSort()) { +// // The sortedBy key shouldn't contain partitionBy key. +// sortedBy = {std::make_shared( +// "c4", core::SortOrder{true, true})}; +// // The sortColumnIndices_ should represent the indices after removing +// // the partition keys. 
+// if (testMode_ == TestMode::kBucketed) { +// sortColumnIndices_ = {2}; +// } else { +// sortColumnIndices_ = {4}; +// } +// sortedFlags_ = {{true, true}}; +// } +// bucketProperty_ = std::make_shared( +// testParam_.bucketKind(), 4, bucketedBy, bucketedTypes, sortedBy); +// } +//} +// +//void TableWriterTestBase::SetUp() { +// HiveConnectorTestBase::SetUp(); +//} +// +//std::shared_ptr TableWriterTestBase::assertQueryWithWriterConfigs( +// const core::PlanNodePtr& plan, +// std::vector> filePaths, +// const std::string& duckDbSql, +// bool spillEnabled) { +// std::vector splits; +// for (const auto& filePath : filePaths) { +// splits.push_back(Split(makeHiveConnectorSplit(filePath->getPath()))); +// } +// if (!spillEnabled) { +// return AssertQueryBuilder(plan, duckDbQueryRunner_) +// .maxDrivers( +// 2 * std::max(kNumTableWriterCount, kNumPartitionedTableWriterCount)) +// .config( +// QueryConfig::kTaskWriterCount, std::to_string(numTableWriterCount_)) +// .config( +// QueryConfig::kTaskPartitionedWriterCount, +// std::to_string(numPartitionedTableWriterCount_)) +// // Scale writer settings to trigger partition rebalancing. +// .config(QueryConfig::kScaleWriterRebalanceMaxMemoryUsageRatio, "1.0") +// .config( +// QueryConfig::kScaleWriterMinProcessedBytesRebalanceThreshold, "0") +// .config( +// QueryConfig:: +// kScaleWriterMinPartitionProcessedBytesRebalanceThreshold, +// "0") +// .splits(splits) +// .assertResults(duckDbSql); +// } +// const auto spillDirectory = TempDirectoryPath::create(); +// TestScopedSpillInjection scopedSpillInjection(100); +// return AssertQueryBuilder(plan, duckDbQueryRunner_) +// .spillDirectory(spillDirectory->getPath()) +// .maxDrivers( +// 2 * std::max(kNumTableWriterCount, kNumPartitionedTableWriterCount)) +// .config( +// QueryConfig::kTaskWriterCount, std::to_string(numTableWriterCount_)) +// .config( +// QueryConfig::kTaskPartitionedWriterCount, +// std::to_string(numPartitionedTableWriterCount_)) +// .config(core::QueryConfig::kSpillEnabled, "true") +// .config(QueryConfig::kWriterSpillEnabled, "true") +// // Scale writer settings to trigger partition rebalancing. +// .config(QueryConfig::kScaleWriterRebalanceMaxMemoryUsageRatio, "1.0") +// .config(QueryConfig::kScaleWriterMinProcessedBytesRebalanceThreshold, "0") +// .config( +// QueryConfig::kScaleWriterMinPartitionProcessedBytesRebalanceThreshold, +// "0") +// .splits(splits) +// .assertResults(duckDbSql); +//} +// +//std::shared_ptr TableWriterTestBase::assertQueryWithWriterConfigs( +// const core::PlanNodePtr& plan, +// const std::string& duckDbSql, +// bool enableSpill) { +// if (!enableSpill) { +// TestScopedSpillInjection scopedSpillInjection(100); +// return AssertQueryBuilder(plan, duckDbQueryRunner_) +// .maxDrivers( +// 2 * std::max(kNumTableWriterCount, kNumPartitionedTableWriterCount)) +// .config( +// QueryConfig::kTaskWriterCount, std::to_string(numTableWriterCount_)) +// .config( +// QueryConfig::kTaskPartitionedWriterCount, +// std::to_string(numPartitionedTableWriterCount_)) +// .config(core::QueryConfig::kSpillEnabled, "true") +// .config(QueryConfig::kWriterSpillEnabled, "true") +// // Scale writer settings to trigger partition rebalancing. 
+// .config(QueryConfig::kScaleWriterRebalanceMaxMemoryUsageRatio, "1.0") +// .config( +// QueryConfig::kScaleWriterMinProcessedBytesRebalanceThreshold, "0") +// .config( +// QueryConfig:: +// kScaleWriterMinPartitionProcessedBytesRebalanceThreshold, +// "0") +// .assertResults(duckDbSql); +// } +// const auto spillDirectory = TempDirectoryPath::create(); +// TestScopedSpillInjection scopedSpillInjection(100); +// return AssertQueryBuilder(plan, duckDbQueryRunner_) +// .spillDirectory(spillDirectory->getPath()) +// .maxDrivers( +// 2 * std::max(kNumTableWriterCount, kNumPartitionedTableWriterCount)) +// .config( +// QueryConfig::kTaskWriterCount, std::to_string(numTableWriterCount_)) +// .config( +// QueryConfig::kTaskPartitionedWriterCount, +// std::to_string(numPartitionedTableWriterCount_)) +// .config(core::QueryConfig::kSpillEnabled, "true") +// .config(QueryConfig::kWriterSpillEnabled, "true") +// // Scale writer settings to trigger partition rebalancing. +// .config(QueryConfig::kScaleWriterRebalanceMaxMemoryUsageRatio, "1.0") +// .config(QueryConfig::kScaleWriterMinProcessedBytesRebalanceThreshold, "0") +// .config( +// QueryConfig::kScaleWriterMinPartitionProcessedBytesRebalanceThreshold, +// "0") +// .assertResults(duckDbSql); +//} +// +//RowVectorPtr TableWriterTestBase::runQueryWithWriterConfigs( +// const core::PlanNodePtr& plan, +// bool spillEnabled) { +// if (!spillEnabled) { +// return AssertQueryBuilder(plan, duckDbQueryRunner_) +// .maxDrivers( +// 2 * std::max(kNumTableWriterCount, kNumPartitionedTableWriterCount)) +// .config( +// QueryConfig::kTaskWriterCount, std::to_string(numTableWriterCount_)) +// .config( +// QueryConfig::kTaskPartitionedWriterCount, +// std::to_string(numPartitionedTableWriterCount_)) +// // Scale writer settings to trigger partition rebalancing. +// .config(QueryConfig::kScaleWriterRebalanceMaxMemoryUsageRatio, "1.0") +// .config( +// QueryConfig::kScaleWriterMinProcessedBytesRebalanceThreshold, "0") +// .config( +// QueryConfig:: +// kScaleWriterMinPartitionProcessedBytesRebalanceThreshold, +// "0") +// .copyResults(pool()); +// } +// const auto spillDirectory = TempDirectoryPath::create(); +// TestScopedSpillInjection scopedSpillInjection(100); +// return AssertQueryBuilder(plan, duckDbQueryRunner_) +// .spillDirectory(spillDirectory->getPath()) +// .maxDrivers( +// 2 * std::max(kNumTableWriterCount, kNumPartitionedTableWriterCount)) +// .config( +// QueryConfig::kTaskWriterCount, std::to_string(numTableWriterCount_)) +// .config( +// QueryConfig::kTaskPartitionedWriterCount, +// std::to_string(numPartitionedTableWriterCount_)) +// .config(core::QueryConfig::kSpillEnabled, "true") +// .config(QueryConfig::kWriterSpillEnabled, "true") +// // Scale writer settings to trigger partition rebalancing. 
+// .config(QueryConfig::kScaleWriterRebalanceMaxMemoryUsageRatio, "1.0") +// .config(QueryConfig::kScaleWriterMinProcessedBytesRebalanceThreshold, "0") +// .config( +// QueryConfig::kScaleWriterMinPartitionProcessedBytesRebalanceThreshold, +// "0") +// .copyResults(pool()); +//} +// +//void TableWriterTestBase::setCommitStrategy(CommitStrategy commitStrategy) { +// commitStrategy_ = commitStrategy; +//} +// +//void TableWriterTestBase::setPartitionBy( +// const std::vector& partitionBy) { +// partitionedBy_ = partitionBy; +// for (const auto& partitionColumn : partitionedBy_) { +// for (int i = 0; i < rowType_->size(); ++i) { +// if (rowType_->nameOf(i) == partitionColumn) { +// partitionChannels_.emplace_back(i); +// partitionTypes_.emplace_back(rowType_->childAt(i)); +// } +// } +// } +//} +// +//void TableWriterTestBase::setBucketProperty( +// HiveBucketProperty::Kind kind, +// uint32_t bucketCount, +// const std::vector& bucketedBy, +// const std::vector& bucketedTypes, +// const std::vector>& sortedBy) { +// bucketProperty_ = std::make_shared( +// kind, bucketCount, bucketedBy, bucketedTypes, sortedBy); +//} +// +//void TableWriterTestBase::setDataTypes( +// const RowTypePtr& inputType, +// const RowTypePtr& tableSchema) { +// rowType_ = inputType; +// if (tableSchema != nullptr) { +// setTableSchema(tableSchema); +// } else { +// setTableSchema(rowType_); +// } +//} +// +//void TableWriterTestBase::setTableSchema(const RowTypePtr& tableSchema) { +// tableSchema_ = tableSchema; +//} +// +//std::vector> +//TableWriterTestBase::makeHiveConnectorSplits( +// const std::shared_ptr& directoryPath) { +// return makeHiveConnectorSplits(directoryPath->getPath()); +//} +// +//std::vector> +//TableWriterTestBase::makeHiveConnectorSplits(const std::string& directoryPath) { +// std::vector> splits; +// for (auto& path : fs::recursive_directory_iterator(directoryPath)) { +// if (path.is_regular_file()) { +// splits.push_back(HiveConnectorTestBase::makeHiveConnectorSplits( +// path.path().string(), 1, fileFormat_)[0]); +// } +// } +// return splits; +//} +// +//// Lists and returns all the regular files from a given directory +//// recursively. +//std::vector TableWriterTestBase::listAllFiles( +// const std::string& directoryPath) { +// std::vector files; +// for (auto& path : fs::recursive_directory_iterator(directoryPath)) { +// if (path.is_regular_file()) { +// files.push_back(path.path().filename()); +// } +// } +// return files; +//} +// +//// Builds and returns the hive splits from the list of files with one split +//// per each file. +//std::vector> +//TableWriterTestBase::makeHiveConnectorSplits( +// const std::vector& filePaths) { +// std::vector> splits; +// for (const auto& filePath : filePaths) { +// splits.push_back(HiveConnectorTestBase::makeHiveConnectorSplits( +// filePath.string(), 1, fileFormat_)[0]); +// } +// return splits; +//} +// +//std::vector TableWriterTestBase::makeVectors( +// int32_t numVectors, +// int32_t rowsPerVector) { +// auto rowVectors = +// HiveConnectorTestBase::makeVectors(rowType_, numVectors, rowsPerVector); +// if (testMode_ == TestMode::kUnpartitioned || +// testMode_ == TestMode::kOnlyBucketed) { +// return rowVectors; +// } +// // In case of partitioned table write test case, we ensure the number of +// // unique partition key values are capped. 
+// for (auto& rowVector : rowVectors) { +// auto c0PartitionVector = +// makeFlatVector(rowsPerVector, [&](auto /*unused*/) { +// return folly::Random().rand32() % numPartitionKeyValues_[0]; +// }); +// auto c1PartitionVector = +// makeFlatVector(rowsPerVector, [&](auto /*unused*/) { +// return folly::Random().rand32() % numPartitionKeyValues_[1]; +// }); +// rowVector->childAt(0) = c0PartitionVector; +// rowVector->childAt(1) = c1PartitionVector; +// } +// return rowVectors; +//} +// +//RowVectorPtr TableWriterTestBase::makeConstantVector(size_t size) { +// return makeRowVector( +// rowType_->names(), +// {makeConstant((int64_t)123'456, size), +// makeConstant((int32_t)321, size), +// makeConstant((int16_t)12'345, size), +// makeConstant(variant(TypeKind::REAL), size), +// makeConstant((double)1'234.01, size), +// makeConstant(variant(TypeKind::VARCHAR), size)}); +//} +// +//std::vector TableWriterTestBase::makeBatches( +// vector_size_t numBatches, +// std::function makeVector) { +// std::vector batches; +// batches.reserve(numBatches); +// for (int32_t i = 0; i < numBatches; ++i) { +// batches.push_back(makeVector(i)); +// } +// return batches; +//} +// +//std::set TableWriterTestBase::getLeafSubdirectories( +// const std::string& directoryPath) { +// std::set subdirectories; +// for (auto& path : fs::recursive_directory_iterator(directoryPath)) { +// if (path.is_regular_file()) { +// subdirectories.emplace(path.path().parent_path().string()); +// } +// } +// return subdirectories; +//} +// +//std::vector TableWriterTestBase::getRecursiveFiles( +// const std::string& directoryPath) { +// std::vector files; +// for (auto& path : fs::recursive_directory_iterator(directoryPath)) { +// if (path.is_regular_file()) { +// files.push_back(path.path().string()); +// } +// } +// return files; +//} +// +//uint32_t TableWriterTestBase::countRecursiveFiles( +// const std::string& directoryPath) { +// return getRecursiveFiles(directoryPath).size(); +//} +// +//// Helper method to return InsertTableHandle. +//std::shared_ptr +//TableWriterTestBase::createInsertTableHandle( +// const RowTypePtr& outputRowType, +// const connector::common::LocationHandle::TableType& outputTableType, +// const std::string& outputDirectoryPath, +// const std::vector& partitionedBy, +// const std::shared_ptr bucketProperty, +// const std::optional compressionKind) { +// return std::make_shared( +// kHiveConnectorId, +// makeHiveInsertTableHandle( +// outputRowType->names(), +// outputRowType->children(), +// partitionedBy, +// bucketProperty, +// makeLocationHandle( +// outputDirectoryPath, std::nullopt, outputTableType), +// fileFormat_, +// compressionKind)); +//} +// +//// Returns a table insert plan node. 
+//PlanNodePtr TableWriterTestBase::createInsertPlan( +// PlanBuilder& inputPlan, +// const RowTypePtr& outputRowType, +// const std::string& outputDirectoryPath, +// const std::vector& partitionedBy, +// std::shared_ptr bucketProperty, +// const std::optional compressionKind, +// int numTableWriters, +// const connector::common::LocationHandle::TableType& outputTableType, +// const connector::common::CommitStrategy& outputCommitStrategy, +// bool aggregateResult, +// std::shared_ptr aggregationNode) { +// return createInsertPlan( +// inputPlan, +// inputPlan.planNode()->outputType(), +// outputRowType, +// outputDirectoryPath, +// partitionedBy, +// std::move(bucketProperty), +// compressionKind, +// numTableWriters, +// outputTableType, +// outputCommitStrategy, +// aggregateResult, +// aggregationNode); +//} +// +//PlanNodePtr TableWriterTestBase::createInsertPlan( +// PlanBuilder& inputPlan, +// const RowTypePtr& inputRowType, +// const RowTypePtr& tableRowType, +// const std::string& outputDirectoryPath, +// const std::vector& partitionedBy, +// std::shared_ptr bucketProperty, +// const std::optional compressionKind, +// int numTableWriters, +// const connector::common::LocationHandle::TableType& outputTableType, +// const connector::common::CommitStrategy& outputCommitStrategy, +// bool aggregateResult, +// std::shared_ptr aggregationNode) { +// if (numTableWriters == 1) { +// return createInsertPlanWithSingleWriter( +// inputPlan, +// inputRowType, +// tableRowType, +// outputDirectoryPath, +// partitionedBy, +// bucketProperty, +// compressionKind, +// outputTableType, +// outputCommitStrategy, +// aggregateResult, +// aggregationNode); +// } else if (bucketProperty_ == nullptr) { +// return createInsertPlanWithForNonBucketedTable( +// inputPlan, +// inputRowType, +// tableRowType, +// outputDirectoryPath, +// partitionedBy, +// compressionKind, +// outputTableType, +// outputCommitStrategy, +// aggregateResult, +// aggregationNode); +// } else { +// return createInsertPlanForBucketTable( +// inputPlan, +// inputRowType, +// tableRowType, +// outputDirectoryPath, +// partitionedBy, +// bucketProperty, +// compressionKind, +// outputTableType, +// outputCommitStrategy, +// aggregateResult, +// aggregationNode); +// } +//} +// +//PlanNodePtr TableWriterTestBase::createInsertPlanWithSingleWriter( +// PlanBuilder& inputPlan, +// const RowTypePtr& inputRowType, +// const RowTypePtr& tableRowType, +// const std::string& outputDirectoryPath, +// const std::vector& partitionedBy, +// std::shared_ptr bucketProperty, +// const std::optional compressionKind, +// const connector::common::LocationHandle::TableType& outputTableType, +// const connector::common::CommitStrategy& outputCommitStrategy, +// bool aggregateResult, +// std::shared_ptr aggregationNode) { +// const bool addScaleWriterExchange = +// scaleWriter_ && (bucketProperty != nullptr); +// auto insertPlan = inputPlan; +// if (addScaleWriterExchange) { +// if (!partitionedBy.empty()) { +// insertPlan.scaleWriterlocalPartition( +// inputColumnNames(partitionedBy, tableRowType, inputRowType)); +// } else { +// insertPlan.scaleWriterlocalPartitionRoundRobin(); +// } +// } +// insertPlan +// .addNode(addTableWriter( +// inputRowType, +// tableRowType->names(), +// aggregationNode, +// createInsertTableHandle( +// tableRowType, +// outputTableType, +// outputDirectoryPath, +// partitionedBy, +// bucketProperty, +// compressionKind), +// false, +// outputCommitStrategy)) +// .capturePlanNodeId(tableWriteNodeId_); +// if 
(addScaleWriterExchange) { +// if (!partitionedBy.empty()) { +// insertPlan.scaleWriterlocalPartition( +// inputColumnNames(partitionedBy, tableRowType, inputRowType)); +// } else { +// insertPlan.scaleWriterlocalPartitionRoundRobin(); +// } +// } +// if (aggregateResult) { +// insertPlan.project({TableWriteTraits::rowCountColumnName()}) +// .singleAggregation( +// {}, +// {fmt::format("sum({})", TableWriteTraits::rowCountColumnName())}); +// } +// return insertPlan.planNode(); +//} +// +//PlanNodePtr TableWriterTestBase::createInsertPlanForBucketTable( +// PlanBuilder& inputPlan, +// const RowTypePtr& inputRowType, +// const RowTypePtr& tableRowType, +// const std::string& outputDirectoryPath, +// const std::vector& partitionedBy, +// std::shared_ptr bucketProperty, +// const std::optional compressionKind, +// const connector::common::LocationHandle::TableType& outputTableType, +// const connector::common::CommitStrategy& outputCommitStrategy, +// bool aggregateResult, +// std::shared_ptr aggregationNode) { +// // Since we might do column rename, so generate bucket property based on +// // the data type from 'inputPlan'. +// std::vector bucketColumns; +// bucketColumns.reserve(bucketProperty->bucketedBy().size()); +// for (int i = 0; i < bucketProperty->bucketedBy().size(); ++i) { +// bucketColumns.push_back(inputRowType->names()[tableRowType->getChildIdx( +// bucketProperty->bucketedBy()[i])]); +// } +// auto localPartitionBucketProperty = std::make_shared( +// bucketProperty->kind(), +// bucketProperty->bucketCount(), +// bucketColumns, +// bucketProperty->bucketedTypes(), +// bucketProperty->sortedBy()); +// auto insertPlan = +// inputPlan.localPartitionByBucket(localPartitionBucketProperty) +// .addNode(addTableWriter( +// inputRowType, +// tableRowType->names(), +// nullptr, +// createInsertTableHandle( +// tableRowType, +// outputTableType, +// outputDirectoryPath, +// partitionedBy, +// bucketProperty, +// compressionKind), +// false, +// outputCommitStrategy)) +// .capturePlanNodeId(tableWriteNodeId_) +// .localPartition({}) +// .tableWriteMerge(); +// if (aggregateResult) { +// insertPlan.project({TableWriteTraits::rowCountColumnName()}) +// .singleAggregation( +// {}, +// {fmt::format("sum({})", TableWriteTraits::rowCountColumnName())}); +// } +// return insertPlan.planNode(); +//} +// +//// static +//std::vector TableWriterTestBase::inputColumnNames( +// const std::vector& tableColumnNames, +// const RowTypePtr& tableRowType, +// const RowTypePtr& inputRowType) { +// std::vector inputNames; +// inputNames.reserve(tableColumnNames.size()); +// for (const auto& tableColumnName : tableColumnNames) { +// const auto columnIdx = tableRowType->getChildIdx(tableColumnName); +// inputNames.push_back(inputRowType->nameOf(columnIdx)); +// } +// return inputNames; +//} +// +//PlanNodePtr TableWriterTestBase::createInsertPlanWithForNonBucketedTable( +// PlanBuilder& inputPlan, +// const RowTypePtr& inputRowType, +// const RowTypePtr& tableRowType, +// const std::string& outputDirectoryPath, +// const std::vector& partitionedBy, +// const std::optional compressionKind, +// const connector::common::LocationHandle::TableType& outputTableType, +// const connector::common::CommitStrategy& outputCommitStrategy, +// bool aggregateResult, +// std::shared_ptr aggregationNode) { +// auto insertPlan = inputPlan; +// if (scaleWriter_) { +// if (!partitionedBy.empty()) { +// insertPlan.scaleWriterlocalPartition( +// inputColumnNames(partitionedBy, tableRowType, inputRowType)); +// } else { +// 
insertPlan.scaleWriterlocalPartitionRoundRobin(); +// } +// } +// insertPlan +// .addNode(addTableWriter( +// inputRowType, +// tableRowType->names(), +// nullptr, +// createInsertTableHandle( +// tableRowType, +// outputTableType, +// outputDirectoryPath, +// partitionedBy, +// nullptr, +// compressionKind), +// false, +// outputCommitStrategy)) +// .capturePlanNodeId(tableWriteNodeId_) +// .localPartition(std::vector{}) +// .tableWriteMerge(); +// if (aggregateResult) { +// insertPlan.project({TableWriteTraits::rowCountColumnName()}) +// .singleAggregation( +// {}, +// {fmt::format("sum({})", TableWriteTraits::rowCountColumnName())}); +// } +// return insertPlan.planNode(); +//} +// +//std::string TableWriterTestBase::partitionNameToPredicate( +// const std::string& partitionName, +// const std::vector& partitionTypes) { +// std::vector conjuncts; +// std::vector partitionKeyValues; +// folly::split('/', partitionName, partitionKeyValues); +// VELOX_CHECK_EQ(partitionKeyValues.size(), partitionTypes.size()); +// for (auto i = 0; i < partitionKeyValues.size(); ++i) { +// if (partitionTypes[i]->isVarchar() || partitionTypes[i]->isVarbinary() || +// partitionTypes[i]->isDate()) { +// conjuncts.push_back(partitionKeyValues[i] +// .replace(partitionKeyValues[i].find("="), 1, "='") +// .append("'")); +// } else { +// conjuncts.push_back(partitionKeyValues[i]); +// } +// } +// return folly::join(" AND ", conjuncts); +//} +// +//std::string TableWriterTestBase::partitionNameToPredicate( +// const std::vector& partitionDirNames) { +// std::vector conjuncts; +// VELOX_CHECK_EQ(partitionDirNames.size(), partitionTypes_.size()); +// std::vector partitionKeyValues = partitionDirNames; +// for (auto i = 0; i < partitionDirNames.size(); ++i) { +// if (partitionTypes_[i]->isVarchar() || partitionTypes_[i]->isVarbinary() || +// partitionTypes_[i]->isDate()) { +// conjuncts.push_back(partitionKeyValues[i] +// .replace(partitionKeyValues[i].find("="), 1, "='") +// .append("'")); +// } else { +// conjuncts.push_back(partitionDirNames[i]); +// } +// } +// return folly::join(" AND ", conjuncts); +//} +// +//void TableWriterTestBase::verifyUnbucketedFilePath( +// const std::filesystem::path& filePath, +// const std::string& targetDir) { +// ASSERT_EQ(filePath.parent_path().string(), targetDir); +// if (commitStrategy_ == connector::common::CommitStrategy::kNoCommit) { +// ASSERT_TRUE(RE2::FullMatch( +// filePath.filename().string(), +// fmt::format( +// "test_cursor.+_[0-{}]_{}_.+", +// numTableWriterCount_ - 1, +// tableWriteNodeId_))) +// << filePath.filename().string(); +// } else { +// ASSERT_TRUE(RE2::FullMatch( +// filePath.filename().string(), +// fmt::format( +// ".tmp.velox.test_cursor.+_[0-{}]_{}_.+", +// numTableWriterCount_ - 1, +// tableWriteNodeId_))) +// << filePath.filename().string(); +// } +//} +// +//void TableWriterTestBase::verifyPartitionedFilePath( +// const std::filesystem::path& filePath, +// const std::string& targetDir) { +// verifyPartitionedDirPath(filePath.parent_path(), targetDir); +// verifyUnbucketedFilePath(filePath, filePath.parent_path().string()); +//} +// +//void TableWriterTestBase::verifyBucketedFileName( +// const std::filesystem::path& filePath) { +// if (commitStrategy_ == connector::common::CommitStrategy::kNoCommit) { +// if (fileFormat_ == FileFormat::PARQUET) { +// ASSERT_TRUE(RE2::FullMatch( +// filePath.filename().string(), +// "0[0-9]+_0_TaskCursorQuery_[0-9]+\\.parquet$")) +// << filePath.filename().string(); +// } else { +// ASSERT_TRUE(RE2::FullMatch( +// 
filePath.filename().string(), "0[0-9]+_0_TaskCursorQuery_[0-9]+")) +// << filePath.filename().string(); +// } +// } else { +// if (fileFormat_ == FileFormat::PARQUET) { +// ASSERT_TRUE(RE2::FullMatch( +// filePath.filename().string(), +// ".tmp.velox.0[0-9]+_0_TaskCursorQuery_[0-9]+_.+\\.parquet$")) +// << filePath.filename().string(); +// } else { +// ASSERT_TRUE(RE2::FullMatch( +// filePath.filename().string(), +// ".tmp.velox.0[0-9]+_0_TaskCursorQuery_[0-9]+_.+")) +// << filePath.filename().string(); +// } +// } +//} +// +//void TableWriterTestBase::verifyBucketedFilePath( +// const std::filesystem::path& filePath, +// const std::string& targetDir) { +// verifyPartitionedDirPath(filePath, targetDir); +// verifyBucketedFileName(filePath); +//} +// +//void TableWriterTestBase::verifyPartitionedDirPath( +// const std::filesystem::path& dirPath, +// const std::string& targetDir) { +// std::string regex(targetDir); +// bool matched{false}; +// for (int i = 0; i < partitionedBy_.size(); ++i) { +// regex = fmt::format("{}/{}=.+", regex, partitionedBy_[i]); +// if (RE2::FullMatch(dirPath.string(), regex)) { +// matched = true; +// break; +// } +// } +// ASSERT_TRUE(matched) << dirPath; +//} +// +//uint32_t TableWriterTestBase::parseBucketId(const std::string& bucketFileName) { +// uint32_t bucketId; +// if (commitStrategy_ == connector::common::CommitStrategy::kNoCommit) { +// VELOX_CHECK(RE2::FullMatch(bucketFileName, "(\\d+)_.+", &bucketId)); +// } else { +// VELOX_CHECK( +// RE2::FullMatch(bucketFileName, ".tmp.velox.(\\d+)_.+", &bucketId)); +// } +// return bucketId; +//} +// +//// Returns the list of partition directory names in the given directory +//// path. +//std::vector TableWriterTestBase::getPartitionDirNames( +// const std::filesystem::path& dirPath) { +// std::vector dirNames; +// auto nextPath = dirPath; +// for (int i = 0; i < partitionedBy_.size(); ++i) { +// dirNames.push_back(nextPath.filename().string()); +// nextPath = nextPath.parent_path(); +// } +// return dirNames; +//} +// +//void TableWriterTestBase::verifyPartitionedFilesData( +// const std::vector& filePaths, +// const std::filesystem::path& dirPath) { +// HiveConnectorTestBase::assertQuery( +// PlanBuilder().tableScan(rowType_).planNode(), +// {makeHiveConnectorSplits(filePaths)}, +// fmt::format( +// "SELECT c2, c3, c4, c5 FROM tmp WHERE {}", +// partitionNameToPredicate(getPartitionDirNames(dirPath)))); +//} +// +//std::unique_ptr TableWriterTestBase::getBucketFunction( +// const RowTypePtr& outputType) { +// const auto& bucketedBy = bucketProperty_->bucketedBy(); +// std::vector bucketedByChannels; +// bucketedByChannels.reserve(bucketedBy.size()); +// for (auto i = 0; i < bucketedBy.size(); ++i) { +// const auto& bucketColumn = bucketedBy[i]; +// for (column_index_t columnChannel = 0; columnChannel < outputType->size(); +// ++columnChannel) { +// if (outputType->nameOf(columnChannel) == bucketColumn) { +// bucketedByChannels.push_back(columnChannel); +// break; +// } +// } +// } +// VELOX_USER_CHECK_EQ(bucketedByChannels.size(), bucketedBy.size()); +// return std::make_unique( +// bucketProperty_->bucketCount(), bucketedByChannels); +//} +// +//void TableWriterTestBase::verifyBucketedFileData( +// const std::filesystem::path& filePath, +// const RowTypePtr& outputFileType) { +// const std::vector filePaths = {filePath}; +// // Read data from bucketed file on disk into 'rowVector'. 
+// core::PlanNodeId scanNodeId; +// auto plan = PlanBuilder() +// .tableScan(outputFileType, {}, "", outputFileType) +// .capturePlanNodeId(scanNodeId) +// .planNode(); +// const auto resultVector = +// AssertQueryBuilder(plan) +// .splits(scanNodeId, makeHiveConnectorSplits(filePaths)) +// .copyResults(pool_.get()); +// // Parse the bucket id encoded in bucketed file name. +// const uint32_t expectedBucketId = parseBucketId(filePath.filename().string()); +// // Compute the bucket id from read result by applying hash partition on +// // bucketed columns in read result, and we expect they all match the one +// // encoded in file name. +// auto bucketFunction = getBucketFunction(outputFileType); +// std::vector bucketIds; +// bucketIds.reserve(resultVector->size()); +// bucketFunction->partition(*resultVector, bucketIds); +// for (const auto bucketId : bucketIds) { +// ASSERT_EQ(expectedBucketId, bucketId); +// } +// if (!testParam_.bucketSort()) { +// return; +// } +// // Verifies the sorting behavior +// for (int i = 0; i < resultVector->size() - 1; ++i) { +// for (int j = 0; j < sortColumnIndices_.size(); ++j) { +// auto compareResult = +// resultVector->childAt(sortColumnIndices_.at(j)) +// ->compare( +// resultVector->childAt(sortColumnIndices_.at(j)) +// ->wrappedVector(), +// i, +// i + 1, +// sortedFlags_[j]); +// if (compareResult.has_value()) { +// if (compareResult.value() < 0) { +// break; +// } +// ASSERT_EQ(compareResult.value(), 0); +// } +// } +// } +//} +// +//void TableWriterTestBase::verifyTableWriterOutput( +// const std::string& targetDir, +// const RowTypePtr& bucketCheckFileType, +// bool verifyPartitionedData, +// bool verifyBucketedData) { +// SCOPED_TRACE(testParam_.toString()); +// std::vector filePaths; +// std::vector dirPaths; +// for (auto& path : fs::recursive_directory_iterator(targetDir)) { +// if (path.is_regular_file()) { +// filePaths.push_back(path.path()); +// } else { +// dirPaths.push_back(path.path()); +// } +// } +// if (testMode_ == TestMode::kUnpartitioned) { +// ASSERT_EQ(dirPaths.size(), 0); +// ASSERT_LE(filePaths.size(), numTableWriterCount_); +// verifyUnbucketedFilePath(filePaths[0], targetDir); +// return; +// } else if (testMode_ == TestMode::kOnlyBucketed) { +// ASSERT_EQ(dirPaths.size(), 0); +// for (const auto& filePath : filePaths) { +// ASSERT_EQ(filePath.parent_path().string(), targetDir); +// verifyBucketedFileName(filePath); +// if (verifyBucketedData) { +// verifyBucketedFileData(filePath, bucketCheckFileType); +// } +// } +// return; +// } +// // Validation for both partitioned with and without buckets. +// ASSERT_EQ(numPartitionKeyValues_.size(), 2); +// const auto totalPartitions = +// numPartitionKeyValues_[0] * numPartitionKeyValues_[1]; +// ASSERT_LE(dirPaths.size(), totalPartitions + numPartitionKeyValues_[0]); +// int32_t numLeafDir{0}; +// for (const auto& dirPath : dirPaths) { +// verifyPartitionedDirPath(dirPath, targetDir); +// if (dirPath.parent_path().string() != targetDir) { +// ++numLeafDir; +// } +// } +// if (testMode_ == TestMode::kPartitioned) { +// // We expect only one file under each directory without dynamic writer +// // support. 
+// ASSERT_GE(numLeafDir * numTableWriterCount_, filePaths.size()); +// for (const auto& filePath : filePaths) { +// verifyPartitionedFilePath(filePath, targetDir); +// if (verifyPartitionedData) { +// verifyPartitionedFilesData({filePath}, filePath.parent_path()); +// } +// } +// return; +// } +// ASSERT_GE(numLeafDir * bucketProperty_->bucketCount(), filePaths.size()); +// std::unordered_map> +// bucketFilesPerPartition; +// for (const auto& filePath : filePaths) { +// bucketFilesPerPartition[filePath.parent_path().string()].push_back( +// filePath); +// verifyBucketedFilePath(filePath, targetDir); +// if (verifyBucketedData) { +// verifyBucketedFileData(filePath, bucketCheckFileType); +// } +// } +// if (verifyPartitionedData) { +// for (const auto& entry : bucketFilesPerPartition) { +// verifyPartitionedFilesData(entry.second, entry.second[0].parent_path()); +// } +// } +//} +// +//int TableWriterTestBase::getNumWriters() { +// return bucketProperty_ != nullptr ? numPartitionedTableWriterCount_ +// : numTableWriterCount_; +//} +//} // namespace velox::exec::test diff --git a/velox/exec/tests/utils/TableWriterTestBase.h b/velox/exec/tests/utils/TableWriterTestBase.h index a98e4e44d722..3f6ff1e693a2 100644 --- a/velox/exec/tests/utils/TableWriterTestBase.h +++ b/velox/exec/tests/utils/TableWriterTestBase.h @@ -13,364 +13,403 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "folly/dynamic.h" -#include "velox/common/base/Fs.h" -#include "velox/common/base/tests/GTestUtils.h" -#include "velox/common/hyperloglog/SparseHll.h" -#include "velox/common/testutil/TestValue.h" -#include "velox/connectors/hive/HiveConfig.h" -#include "velox/connectors/hive/HivePartitionFunction.h" -#include "velox/dwio/common/WriterFactory.h" -#include "velox/exec/PlanNodeStats.h" -#include "velox/exec/TableWriter.h" -#include "velox/exec/tests/utils/AssertQueryBuilder.h" -#include "velox/exec/tests/utils/HiveConnectorTestBase.h" -#include "velox/exec/tests/utils/PlanBuilder.h" -#include "velox/exec/tests/utils/TempDirectoryPath.h" -#include "velox/vector/fuzzer/VectorFuzzer.h" -#include -#include -#include "folly/experimental/EventCount.h" -#include "velox/common/memory/MemoryArbitrator.h" -#include "velox/dwio/common/Options.h" -#include "velox/dwio/dwrf/writer/Writer.h" -#include "velox/exec/tests/utils/ArbitratorTestUtil.h" +#pragma once -namespace velox::exec::test { -using namespace facebook::velox; -using namespace facebook::velox::core; -using namespace facebook::velox::common; -using namespace facebook::velox::exec; -using namespace facebook::velox::exec::test; -using namespace facebook::velox::connector; -using namespace facebook::velox::connector::hive; -using namespace facebook::velox::dwio::common; -using namespace facebook::velox::common::testutil; -using namespace facebook::velox::common::hll; +#include "velox/exec/tests/utils/OperatorTestBase.h" +#include "velox/connectors/common/ConnectorObjectFactoryRegistry.h" +#include "velox/connectors/common/ConnectorNames.h" +#include "velox/connectors/common/Connector.h" +#include "velox/common/compression/CompressionKind.h" +#include -class TableWriterTestBase : public HiveConnectorTestBase { - public: - enum class TestMode { - kUnpartitioned, - kPartitioned, - kBucketed, - kOnlyBucketed, - }; - - static std::string testModeString(TestMode mode); - - // NOTE: google parameterized test framework can't handle complex test - // parameters properly. 
So we encode the different test parameters into one - // integer value. - struct TestParam { - uint64_t value; - - explicit TestParam(uint64_t _value) : value(_value) {} - - TestParam( - FileFormat fileFormat, - TestMode testMode, - CommitStrategy commitStrategy, - HiveBucketProperty::Kind bucketKind, - bool bucketSort, - bool multiDrivers, - CompressionKind compressionKind, - bool scaleWriter); - - CompressionKind compressionKind() const; - - bool multiDrivers() const; - - FileFormat fileFormat() const; - - TestMode testMode() const; - - CommitStrategy commitStrategy() const; - - HiveBucketProperty::Kind bucketKind() const; - - bool bucketSort() const; - - bool scaleWriter() const; - - std::string toString() const; - }; - - protected: - explicit TableWriterTestBase(uint64_t testValue); - - void SetUp() override; - - static std::function addTableWriter( - const RowTypePtr& inputColumns, - const std::vector& tableColumnNames, - const std::shared_ptr& aggregationNode, - const std::shared_ptr& insertHandle, - bool hasPartitioningScheme, - connector::CommitStrategy commitStrategy = - connector::CommitStrategy::kNoCommit); - - static RowTypePtr getNonPartitionsColumns( - const std::vector& partitionedKeys, - const RowTypePtr& rowType); - - static std::shared_ptr generateAggregationNode( - const std::string& name, - const std::vector& groupingKeys, - AggregationNode::Step step, - const PlanNodePtr& source); - - std::shared_ptr assertQueryWithWriterConfigs( - const core::PlanNodePtr& plan, - std::vector> filePaths, - const std::string& duckDbSql, - bool spillEnabled = false); - - std::shared_ptr assertQueryWithWriterConfigs( - const core::PlanNodePtr& plan, - const std::string& duckDbSql, - bool enableSpill = false); - - RowVectorPtr runQueryWithWriterConfigs( - const core::PlanNodePtr& plan, - bool spillEnabled = false); - - void setCommitStrategy(CommitStrategy commitStrategy); - - void setPartitionBy(const std::vector& partitionBy); - - void setBucketProperty( - HiveBucketProperty::Kind kind, - uint32_t bucketCount, - const std::vector& bucketedBy, - const std::vector& bucketedTypes, - const std::vector>& sortedBy = - {}); - - void setDataTypes( - const RowTypePtr& inputType, - const RowTypePtr& tableSchema = nullptr); - - void setTableSchema(const RowTypePtr& tableSchema); - - std::vector> - makeHiveConnectorSplits( - const std::shared_ptr& directoryPath); - - std::vector> - makeHiveConnectorSplits(const std::string& directoryPath); - - // Lists and returns all the regular files from a given directory recursively. - std::vector listAllFiles(const std::string& directoryPath); - - // Builds and returns the hive splits from the list of files with one split - // per each file. - std::vector> - makeHiveConnectorSplits(const std::vector& filePaths); - - std::vector makeVectors( - int32_t numVectors, - int32_t rowsPerVector); +namespace facebook::velox::exec::test { - RowVectorPtr makeConstantVector(size_t size); - - std::vector makeBatches( - vector_size_t numBatches, - std::function makeVector); - - std::set getLeafSubdirectories(const std::string& directoryPath); - - std::vector getRecursiveFiles(const std::string& directoryPath); - - uint32_t countRecursiveFiles(const std::string& directoryPath); - - // Helper method to return InsertTableHandle. 
- std::shared_ptr createInsertTableHandle( - const RowTypePtr& outputRowType, - const connector::hive::LocationHandle::TableType& outputTableType, - const std::string& outputDirectoryPath, - const std::vector& partitionedBy, - const std::shared_ptr bucketProperty, - const std::optional compressionKind = {}); - - // Returns a table insert plan node. - PlanNodePtr createInsertPlan( - PlanBuilder& inputPlan, - const RowTypePtr& outputRowType, - const std::string& outputDirectoryPath, - const std::vector& partitionedBy = {}, - std::shared_ptr bucketProperty = {}, - const std::optional compressionKind = {}, - int numTableWriters = 1, - const connector::hive::LocationHandle::TableType& outputTableType = - connector::hive::LocationHandle::TableType::kNew, - const CommitStrategy& outputCommitStrategy = CommitStrategy::kNoCommit, - bool aggregateResult = true, - std::shared_ptr aggregationNode = nullptr); - - PlanNodePtr createInsertPlan( - PlanBuilder& inputPlan, - const RowTypePtr& inputRowType, - const RowTypePtr& tableRowType, - const std::string& outputDirectoryPath, - const std::vector& partitionedBy = {}, - std::shared_ptr bucketProperty = {}, - const std::optional compressionKind = {}, - int numTableWriters = 1, - const connector::hive::LocationHandle::TableType& outputTableType = - connector::hive::LocationHandle::TableType::kNew, - const CommitStrategy& outputCommitStrategy = CommitStrategy::kNoCommit, - bool aggregateResult = true, - std::shared_ptr aggregationNode = nullptr); - - PlanNodePtr createInsertPlanWithSingleWriter( - PlanBuilder& inputPlan, - const RowTypePtr& inputRowType, - const RowTypePtr& tableRowType, - const std::string& outputDirectoryPath, - const std::vector& partitionedBy, - std::shared_ptr bucketProperty, - const std::optional compressionKind, - const connector::hive::LocationHandle::TableType& outputTableType, - const CommitStrategy& outputCommitStrategy, - bool aggregateResult, - std::shared_ptr aggregationNode); - - PlanNodePtr createInsertPlanForBucketTable( - PlanBuilder& inputPlan, - const RowTypePtr& inputRowType, - const RowTypePtr& tableRowType, - const std::string& outputDirectoryPath, - const std::vector& partitionedBy, - std::shared_ptr bucketProperty, - const std::optional compressionKind, - const connector::hive::LocationHandle::TableType& outputTableType, - const CommitStrategy& outputCommitStrategy, - bool aggregateResult, - std::shared_ptr aggregationNode); - - // Return the corresponding column names in 'inputRowType' of - // 'tableColumnNames' from 'tableRowType'. - static std::vector inputColumnNames( +class TableWriterTestBase : public OperatorTestBase { + public: + /// Creates a connector::common::ConnectorInsertTableHandle via the common factory, serializing + /// HiveBucketProperty into options. + static std::shared_ptr makeInsertTableHandle( const std::vector& tableColumnNames, - const RowTypePtr& tableRowType, - const RowTypePtr& inputRowType); - - PlanNodePtr createInsertPlanWithForNonBucketedTable( - PlanBuilder& inputPlan, - const RowTypePtr& inputRowType, - const RowTypePtr& tableRowType, - const std::string& outputDirectoryPath, + const std::vector& tableColumnTypes, const std::vector& partitionedBy, - const std::optional compressionKind, - const connector::hive::LocationHandle::TableType& outputTableType, - const CommitStrategy& outputCommitStrategy, - bool aggregateResult, - std::shared_ptr aggregationNode); - - // Parameter partitionName is string formatted in the Hive style - // key1=value1/key2=value2/... 
Parameter partitionTypes are types of partition - // keys in the same order as in partitionName.The return value is a SQL - // predicate with values single quoted for string and date and not quoted for - // other supported types, ex., key1='value1' AND key2=value2 AND ... - std::string partitionNameToPredicate( - const std::string& partitionName, - const std::vector& partitionTypes); - - std::string partitionNameToPredicate( - const std::vector& partitionDirNames); - - // Verifies if a unbucketed file name is encoded properly based on the - // used commit strategy. - void verifyUnbucketedFilePath( - const std::filesystem::path& filePath, - const std::string& targetDir); - - // Verifies if a partitioned file path (directory and file name) is encoded - // properly. - void verifyPartitionedFilePath( - const std::filesystem::path& filePath, - const std::string& targetDir); - - // Verifies if the bucket file name is encoded properly. - void verifyBucketedFileName(const std::filesystem::path& filePath); - - // Verifies if a bucketed file path (directory and file name) is encoded - // properly. - void verifyBucketedFilePath( - const std::filesystem::path& filePath, - const std::string& targetDir); - - // Verifies if the given partitioned table directory (names) are encoded - // properly based on the used partitioned keys. - void verifyPartitionedDirPath( - const std::filesystem::path& dirPath, - const std::string& targetDir); - - // Parses and returns the bucket id encoded in the bucketed file name. - uint32_t parseBucketId(const std::string& bucketFileName); - - // Returns the list of partition directory names in the given directory path. - std::vector getPartitionDirNames( - const std::filesystem::path& dirPath); - - // Verifies the partitioned file data on disk by comparing with the same set - // of data read from duckbd. - void verifyPartitionedFilesData( - const std::vector& filePaths, - const std::filesystem::path& dirPath); - - // Gets the hash function used by the production code to build bucket id. - std::unique_ptr getBucketFunction( - const RowTypePtr& outputType); - - // Verifies the bucketed file data by checking if the bucket id of each read - // row is the same as the one encoded in the corresponding bucketed file name. - void verifyBucketedFileData( - const std::filesystem::path& filePath, - const RowTypePtr& outputFileType); - - // Verifies the file layout and data produced by a table writer. 
- void verifyTableWriterOutput( - const std::string& targetDir, - const RowTypePtr& bucketCheckFileType, - bool verifyPartitionedData = true, - bool verifyBucketedData = true); - - int getNumWriters(); - - protected: - static inline int kNumTableWriterCount = 4; - static inline int kNumPartitionedTableWriterCount = 2; - - const TestParam testParam_; - const FileFormat fileFormat_; - const TestMode testMode_; - const int numTableWriterCount_; - const int numPartitionedTableWriterCount_; - const std::shared_ptr planNodeIdGenerator_{ - std::make_shared()}; - - RowTypePtr rowType_; - RowTypePtr tableSchema_; - CommitStrategy commitStrategy_; - std::optional compressionKind_; - bool scaleWriter_; - std::vector partitionedBy_; - std::vector partitionTypes_; - std::vector partitionChannels_; - std::vector numPartitionKeyValues_; - std::vector sortColumnIndices_; - std::vector sortedFlags_; - std::shared_ptr bucketProperty_{nullptr}; - core::PlanNodeId tableWriteNodeId_; + std::shared_ptr locationHandle, + dwio::common::FileFormat tableStorageFormat, + const std::optional compressionKind, + const std::unordered_map& serdeParameters, + const std::shared_ptr& writerOptions, + bool ensureFiles, + const folly::dynamic& bucketPropertyOptions); + + /// Helper for TableWriter tests: packages Hive bucket‐property settings + /// into a folly::dynamic. Does not require Hive headers. + static folly::dynamic makeBucketPropertyOptions( + int32_t bucketCount, + const std::vector& bucketedBy, + const std::vector& bucketedTypes, + const std::vector>& sortedBy); }; -FOLLY_ALWAYS_INLINE std::ostream& operator<<( - std::ostream& os, - TableWriterTestBase::TestMode mode) { - os << TableWriterTestBase::testModeString(mode); - return os; -} -} // namespace velox::exec::test +} // namespace facebook::velox::exec::test + +//#include "folly/dynamic.h" +//#include "velox/common/base/Fs.h" +//#include "velox/common/base/tests/GTestUtils.h" +//#include "velox/common/hyperloglog/SparseHll.h" +//#include "velox/common/testutil/TestValue.h" +//#include "velox/connectors/hive/HiveConfig.h" +//#include "velox/connectors/hive/HivePartitionFunction.h" +//#include "velox/dwio/common/WriterFactory.h" +//#include "velox/exec/PlanNodeStats.h" +//#include "velox/exec/TableWriter.h" +//#include "velox/exec/tests/utils/AssertQueryBuilder.h" +//#include "velox/exec/tests/utils/HiveConnectorTestBase.h" +//#include "velox/exec/tests/utils/PlanBuilder.h" +//#include "velox/exec/tests/utils/TempDirectoryPath.h" +//#include "velox/vector/fuzzer/VectorFuzzer.h" +// +//#include +//#include +//#include "folly/experimental/EventCount.h" +//#include "velox/common/memory/MemoryArbitrator.h" +//#include "velox/dwio/common/Options.h" +//#include "velox/dwio/dwrf/writer/Writer.h" +//#include "velox/exec/tests/utils/ArbitratorTestUtil.h" +// +//namespace velox::exec::test { +//using namespace facebook::velox; +//using namespace facebook::velox::core; +//using namespace facebook::velox::common; +//using namespace facebook::velox::exec; +//using namespace facebook::velox::exec::test; +//using namespace facebook::velox::connector; +//using namespace facebook::velox::connector::hive; +//using namespace facebook::velox::dwio::common; +//using namespace facebook::velox::common::testutil; +//using namespace facebook::velox::common::hll; +// +//class TableWriterTestBase : public HiveConnectorTestBase { +// public: +// enum class TestMode { +// kUnpartitioned, +// kPartitioned, +// kBucketed, +// kOnlyBucketed, +// }; +// +// static std::string 
testModeString(TestMode mode); +// +// // NOTE: google parameterized test framework can't handle complex test +// // parameters properly. So we encode the different test parameters into one +// // integer value. +// struct TestParam { +// uint64_t value; +// +// explicit TestParam(uint64_t _value) : value(_value) {} +// +// TestParam( +// FileFormat fileFormat, +// TestMode testMode, +// connector::common::CommitStrategy commitStrategy, +// HiveBucketProperty::Kind bucketKind, +// bool bucketSort, +// bool multiDrivers, +// CompressionKind compressionKind, +// bool scaleWriter); +// +// CompressionKind compressionKind() const; +// +// bool multiDrivers() const; +// +// FileFormat fileFormat() const; +// +// TestMode testMode() const; +// +// connector::common::CommitStrategy commitStrategy() const; +// +// HiveBucketProperty::Kind bucketKind() const; +// +// bool bucketSort() const; +// +// bool scaleWriter() const; +// +// std::string toString() const; +// }; +// +// protected: +// explicit TableWriterTestBase(uint64_t testValue); +// +// void SetUp() override; +// +// static std::function addTableWriter( +// const RowTypePtr& inputColumns, +// const std::vector& tableColumnNames, +// const std::shared_ptr& aggregationNode, +// const std::shared_ptr& insertHandle, +// bool hasPartitioningScheme, +// connector::common::CommitStrategy commitStrategy = +// connector::common::CommitStrategy::kNoCommit); +// +// static RowTypePtr getNonPartitionsColumns( +// const std::vector& partitionedKeys, +// const RowTypePtr& rowType); +// +// static std::shared_ptr generateAggregationNode( +// const std::string& name, +// const std::vector& groupingKeys, +// AggregationNode::Step step, +// const PlanNodePtr& source); +// +// std::shared_ptr assertQueryWithWriterConfigs( +// const core::PlanNodePtr& plan, +// std::vector> filePaths, +// const std::string& duckDbSql, +// bool spillEnabled = false); +// +// std::shared_ptr assertQueryWithWriterConfigs( +// const core::PlanNodePtr& plan, +// const std::string& duckDbSql, +// bool enableSpill = false); +// +// RowVectorPtr runQueryWithWriterConfigs( +// const core::PlanNodePtr& plan, +// bool spillEnabled = false); +// +// void setCommitStrategy(CommitStrategy commitStrategy); +// +// void setPartitionBy(const std::vector& partitionBy); +// +// void setBucketProperty( +// HiveBucketProperty::Kind kind, +// uint32_t bucketCount, +// const std::vector& bucketedBy, +// const std::vector& bucketedTypes, +// const std::vector>& sortedBy = +// {}); +// +// void setDataTypes( +// const RowTypePtr& inputType, +// const RowTypePtr& tableSchema = nullptr); +// +// void setTableSchema(const RowTypePtr& tableSchema); +// +// std::vector> +// makeHiveConnectorSplits( +// const std::shared_ptr& directoryPath); +// +// std::vector> +// makeHiveConnectorSplits(const std::string& directoryPath); +// +// // Lists and returns all the regular files from a given directory recursively. +// std::vector listAllFiles(const std::string& directoryPath); +// +// // Builds and returns the hive splits from the list of files with one split +// // per each file. 
+// std::vector> +// makeHiveConnectorSplits(const std::vector& filePaths); +// +// std::vector makeVectors( +// int32_t numVectors, +// int32_t rowsPerVector); +// +// RowVectorPtr makeConstantVector(size_t size); +// +// std::vector makeBatches( +// vector_size_t numBatches, +// std::function makeVector); +// +// std::set getLeafSubdirectories(const std::string& directoryPath); +// +// std::vector getRecursiveFiles(const std::string& directoryPath); +// +// uint32_t countRecursiveFiles(const std::string& directoryPath); +// +// // Helper method to return InsertTableHandle. +// std::shared_ptr createInsertTableHandle( +// const RowTypePtr& outputRowType, +// const connector::common::LocationHandle::TableType& outputTableType, +// const std::string& outputDirectoryPath, +// const std::vector& partitionedBy, +// const std::shared_ptr bucketProperty, +// const std::optional compressionKind = {}); +// +// // Returns a table insert plan node. +// PlanNodePtr createInsertPlan( +// PlanBuilder& inputPlan, +// const RowTypePtr& outputRowType, +// const std::string& outputDirectoryPath, +// const std::vector& partitionedBy = {}, +// std::shared_ptr bucketProperty = {}, +// const std::optional compressionKind = {}, +// int numTableWriters = 1, +// const connector::common::LocationHandle::TableType& outputTableType = +// connector::common::LocationHandle::TableType::kNew, +// const connector::common::CommitStrategy& outputCommitStrategy = connector::common::CommitStrategy::kNoCommit, +// bool aggregateResult = true, +// std::shared_ptr aggregationNode = nullptr); +// +// PlanNodePtr createInsertPlan( +// PlanBuilder& inputPlan, +// const RowTypePtr& inputRowType, +// const RowTypePtr& tableRowType, +// const std::string& outputDirectoryPath, +// const std::vector& partitionedBy = {}, +// std::shared_ptr bucketProperty = {}, +// const std::optional compressionKind = {}, +// int numTableWriters = 1, +// const connector::common::LocationHandle::TableType& outputTableType = +// connector::common::LocationHandle::TableType::kNew, +// const connector::common::CommitStrategy& outputCommitStrategy = connector::common::CommitStrategy::kNoCommit, +// bool aggregateResult = true, +// std::shared_ptr aggregationNode = nullptr); +// +// PlanNodePtr createInsertPlanWithSingleWriter( +// PlanBuilder& inputPlan, +// const RowTypePtr& inputRowType, +// const RowTypePtr& tableRowType, +// const std::string& outputDirectoryPath, +// const std::vector& partitionedBy, +// std::shared_ptr bucketProperty, +// const std::optional compressionKind, +// const connector::common::LocationHandle::TableType& outputTableType, +// const connector::common::CommitStrategy& outputCommitStrategy, +// bool aggregateResult, +// std::shared_ptr aggregationNode); +// +// PlanNodePtr createInsertPlanForBucketTable( +// PlanBuilder& inputPlan, +// const RowTypePtr& inputRowType, +// const RowTypePtr& tableRowType, +// const std::string& outputDirectoryPath, +// const std::vector& partitionedBy, +// std::shared_ptr bucketProperty, +// const std::optional compressionKind, +// const connector::common::LocationHandle::TableType& outputTableType, +// const connector::common::CommitStrategy& outputCommitStrategy, +// bool aggregateResult, +// std::shared_ptr aggregationNode); +// +// // Return the corresponding column names in 'inputRowType' of +// // 'tableColumnNames' from 'tableRowType'. 
+// static std::vector inputColumnNames(
+// const std::vector& tableColumnNames,
+// const RowTypePtr& tableRowType,
+// const RowTypePtr& inputRowType);
+//
+// PlanNodePtr createInsertPlanWithForNonBucketedTable(
+// PlanBuilder& inputPlan,
+// const RowTypePtr& inputRowType,
+// const RowTypePtr& tableRowType,
+// const std::string& outputDirectoryPath,
+// const std::vector& partitionedBy,
+// const std::optional compressionKind,
+// const connector::common::LocationHandle::TableType& outputTableType,
+// const connector::common::CommitStrategy& outputCommitStrategy,
+// bool aggregateResult,
+// std::shared_ptr aggregationNode);
+//
+// // Parameter partitionName is a string formatted in the Hive style
+// // key1=value1/key2=value2/... Parameter partitionTypes are types of partition
+// // keys in the same order as in partitionName. The return value is a SQL
+// // predicate with values single quoted for string and date and not quoted for
+// // other supported types, ex., key1='value1' AND key2=value2 AND ...
+// std::string partitionNameToPredicate(
+// const std::string& partitionName,
+// const std::vector& partitionTypes);
+//
+// std::string partitionNameToPredicate(
+// const std::vector& partitionDirNames);
+//
+// // Verifies if an unbucketed file name is encoded properly based on the
+// // used commit strategy.
+// void verifyUnbucketedFilePath(
+// const std::filesystem::path& filePath,
+// const std::string& targetDir);
+//
+// // Verifies if a partitioned file path (directory and file name) is encoded
+// // properly.
+// void verifyPartitionedFilePath(
+// const std::filesystem::path& filePath,
+// const std::string& targetDir);
+//
+// // Verifies if the bucket file name is encoded properly.
+// void verifyBucketedFileName(const std::filesystem::path& filePath);
+//
+// // Verifies if a bucketed file path (directory and file name) is encoded
+// // properly.
+// void verifyBucketedFilePath(
+// const std::filesystem::path& filePath,
+// const std::string& targetDir);
+//
+// // Verifies if the given partitioned table directory (names) are encoded
+// // properly based on the used partitioned keys.
+// void verifyPartitionedDirPath(
+// const std::filesystem::path& dirPath,
+// const std::string& targetDir);
+//
+// // Parses and returns the bucket id encoded in the bucketed file name.
+// uint32_t parseBucketId(const std::string& bucketFileName);
+//
+// // Returns the list of partition directory names in the given directory path.
+// std::vector getPartitionDirNames(
+// const std::filesystem::path& dirPath);
+//
+// // Verifies the partitioned file data on disk by comparing with the same set
+// // of data read from duckdb.
+// void verifyPartitionedFilesData(
+// const std::vector& filePaths,
+// const std::filesystem::path& dirPath);
+//
+// // Gets the hash function used by the production code to build bucket id.
+// std::unique_ptr getBucketFunction(
+// const RowTypePtr& outputType);
+//
+// // Verifies the bucketed file data by checking if the bucket id of each read
+// // row is the same as the one encoded in the corresponding bucketed file name.
+// void verifyBucketedFileData(
+// const std::filesystem::path& filePath,
+// const RowTypePtr& outputFileType);
+//
+// // Verifies the file layout and data produced by a table writer.
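A minimal sketch of the partitionNameToPredicate contract documented above, assuming well-formed Hive-style key1=value1/key2=value2 input; the name partitionNameToPredicateSketch and the needsQuoting flags are hypothetical simplifications of the TypePtr-based signature in the patch.

#include <sstream>
#include <string>
#include <vector>

// Builds "key1='value1' AND key2=value2 AND ..." from a Hive partition name.
// 'needsQuoting[i]' is true when the i-th key is a string or date, mirroring
// the quoting rule described above. Assumes each segment has exactly one '='.
std::string partitionNameToPredicateSketch(
    const std::string& partitionName,
    const std::vector<bool>& needsQuoting) {
  std::ostringstream predicate;
  size_t keyIndex = 0;
  size_t start = 0;
  while (start < partitionName.size()) {
    auto end = partitionName.find('/', start);
    if (end == std::string::npos) {
      end = partitionName.size();
    }
    const auto part = partitionName.substr(start, end - start);
    const auto eq = part.find('=');
    const auto key = part.substr(0, eq);
    const auto value = part.substr(eq + 1);
    if (keyIndex > 0) {
      predicate << " AND ";
    }
    predicate << key << "=";
    if (needsQuoting[keyIndex]) {
      predicate << "'" << value << "'";
    } else {
      predicate << value;
    }
    ++keyIndex;
    start = end + 1;
  }
  return predicate.str();
}

// partitionNameToPredicateSketch("ds=2024-01-01/bucket=7", {true, false})
// yields "ds='2024-01-01' AND bucket=7".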
+// void verifyTableWriterOutput( +// const std::string& targetDir, +// const RowTypePtr& bucketCheckFileType, +// bool verifyPartitionedData = true, +// bool verifyBucketedData = true); +// +// int getNumWriters(); +// +// protected: +// static inline int kNumTableWriterCount = 4; +// static inline int kNumPartitionedTableWriterCount = 2; +// +// const TestParam testParam_; +// const FileFormat fileFormat_; +// const TestMode testMode_; +// const int numTableWriterCount_; +// const int numPartitionedTableWriterCount_; +// const std::shared_ptr planNodeIdGenerator_{ +// std::make_shared()}; +// +// RowTypePtr rowType_; +// RowTypePtr tableSchema_; +// connector::common::CommitStrategy commitStrategy_; +// std::optional compressionKind_; +// bool scaleWriter_; +// std::vector partitionedBy_; +// std::vector partitionTypes_; +// std::vector partitionChannels_; +// std::vector numPartitionKeyValues_; +// std::vector sortColumnIndices_; +// std::vector sortedFlags_; +// std::shared_ptr bucketProperty_{nullptr}; +// core::PlanNodeId tableWriteNodeId_; +//}; +// +//FOLLY_ALWAYS_INLINE std::ostream& operator<<( +// std::ostream& os, +// TableWriterTestBase::TestMode mode) { +// os << TableWriterTestBase::testModeString(mode); +// return os; +//} +//} // namespace velox::exec::test diff --git a/velox/exec/tests/utils/TestIndexStorageConnector.cpp b/velox/exec/tests/utils/TestIndexStorageConnector.cpp index 760fb57be00c..ff504a3002fc 100644 --- a/velox/exec/tests/utils/TestIndexStorageConnector.cpp +++ b/velox/exec/tests/utils/TestIndexStorageConnector.cpp @@ -33,7 +33,7 @@ core::TypedExprPtr toJoinConditionExpr( const RowTypePtr& inputType, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles) { + std::shared_ptr>& columnHandles) { if (joinConditions.empty()) { return nullptr; } @@ -102,7 +102,7 @@ TestIndexSource::TestIndexSource( size_t numEqualJoinKeys, const core::TypedExprPtr& joinConditionExpr, const std::shared_ptr& tableHandle, - connector::ConnectorQueryCtx* connectorQueryCtx, + connector::common::ConnectorQueryCtx* connectorQueryCtx, folly::Executor* executor) : tableHandle_(tableHandle), inputType_(inputType), @@ -138,7 +138,7 @@ void TestIndexSource::checkNotFailed() { } } -std::shared_ptr +std::shared_ptr TestIndexSource::lookup(const LookupRequest& request) { checkNotFailed(); const auto numInputRows = request.input->size(); @@ -250,7 +250,7 @@ TestIndexSource::ResultIterator::ResultIterator( lookupResultIter_->reset(*lookupResult_); } -std::optional> +std::optional> TestIndexSource::ResultIterator::next( vector_size_t size, ContinueFuture& future) { @@ -328,7 +328,7 @@ void TestIndexSource::ResultIterator::asyncLookup( }); } -std::unique_ptr +std::unique_ptr TestIndexSource::ResultIterator::syncLookup(vector_size_t size) { VELOX_CHECK(hasPendingRequest_); if (lookupResultIter_->atEnd()) { @@ -453,18 +453,18 @@ TestIndexConnector::TestIndexConnector( const std::string& id, std::shared_ptr /*unused*/, folly::Executor* executor) - : Connector(id), executor_(executor) {} + : connector::common::Connector(id), executor_(executor) {} -std::shared_ptr TestIndexConnector::createIndexSource( +std::shared_ptr TestIndexConnector::createIndexSource( const RowTypePtr& inputType, size_t numJoinKeys, const std::vector& joinConditions, const RowTypePtr& outputType, - const std::shared_ptr& tableHandle, + const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, - connector::ConnectorQueryCtx* connectorQueryCtx) { + 
std::shared_ptr>& columnHandles, + connector::common::ConnectorQueryCtx* connectorQueryCtx) { VELOX_CHECK_GE(inputType->size(), numJoinKeys + joinConditions.size()); auto testIndexTableHandle = std::dynamic_pointer_cast(tableHandle); diff --git a/velox/exec/tests/utils/TestIndexStorageConnector.h b/velox/exec/tests/utils/TestIndexStorageConnector.h index 795bc49be800..8746446a7b90 100644 --- a/velox/exec/tests/utils/TestIndexStorageConnector.h +++ b/velox/exec/tests/utils/TestIndexStorageConnector.h @@ -41,13 +41,13 @@ struct TestIndexTable { }; // The index table handle which provides the index table for index lookup. -class TestIndexTableHandle : public connector::ConnectorTableHandle { +class TestIndexTableHandle : public connector::common::ConnectorTableHandle { public: explicit TestIndexTableHandle( std::string connectorId, std::shared_ptr indexTable, bool asyncLookup) - : ConnectorTableHandle(std::move(connectorId)), + : connector::common::ConnectorTableHandle(std::move(connectorId)), indexTable_(std::move(indexTable)), asyncLookup_(asyncLookup) {} @@ -105,7 +105,7 @@ class TestIndexTableHandle : public connector::ConnectorTableHandle { const bool asyncLookup_; }; -class TestIndexSource : public connector::IndexSource, +class TestIndexSource : public connector::common::IndexSource, public std::enable_shared_from_this { public: TestIndexSource( @@ -114,7 +114,7 @@ class TestIndexSource : public connector::IndexSource, size_t numEqualJoinKeys, const core::TypedExprPtr& joinConditionExpr, const std::shared_ptr& tableHandle, - connector::ConnectorQueryCtx* connectorQueryCtx, + connector::common::ConnectorQueryCtx* connectorQueryCtx, folly::Executor* executor); std::shared_ptr lookup( @@ -236,7 +236,7 @@ class TestIndexSource : public connector::IndexSource, const RowTypePtr outputType_; const RowTypePtr keyType_; const RowTypePtr valueType_; - connector::ConnectorQueryCtx* const connectorQueryCtx_; + connector::common::ConnectorQueryCtx* const connectorQueryCtx_; const size_t numEqualJoinKeys_; const std::unique_ptr conditionExprSet_; const std::shared_ptr pool_; @@ -261,7 +261,7 @@ class TestIndexSource : public connector::IndexSource, std::unordered_map runtimeStats_; }; -class TestIndexConnector : public connector::Connector { +class TestIndexConnector : public connector::common::Connector { public: TestIndexConnector( const std::string& id, @@ -272,32 +272,32 @@ class TestIndexConnector : public connector::Connector { return true; } - std::unique_ptr createDataSource( + std::unique_ptr createDataSource( const RowTypePtr&, - const std::shared_ptr&, + const std::shared_ptr&, const std::unordered_map< std::string, - std::shared_ptr>&, - connector::ConnectorQueryCtx*) override { + std::shared_ptr>&, + connector::common::ConnectorQueryCtx*) override { VELOX_UNSUPPORTED("{} not implemented", __FUNCTION__); } - std::shared_ptr createIndexSource( + std::shared_ptr createIndexSource( const RowTypePtr& inputType, size_t numJoinKeys, const std::vector& joinConditions, const RowTypePtr& outputType, - const std::shared_ptr& tableHandle, + const std::shared_ptr& tableHandle, const std::unordered_map< std::string, - std::shared_ptr>& columnHandles, - connector::ConnectorQueryCtx* connectorQueryCtx) override; + std::shared_ptr>& columnHandles, + connector::common::ConnectorQueryCtx* connectorQueryCtx) override; - std::unique_ptr createDataSink( + std::unique_ptr createDataSink( RowTypePtr, - std::shared_ptr, - connector::ConnectorQueryCtx*, - connector::CommitStrategy) override { + std::shared_ptr, 
+ connector::common::ConnectorQueryCtx*, + connector::common::CommitStrategy) override { VELOX_UNSUPPORTED("{} not implemented", __FUNCTION__); } @@ -305,12 +305,12 @@ class TestIndexConnector : public connector::Connector { folly::Executor* const executor_; }; -class TestIndexConnectorFactory : public connector::ConnectorFactory { +class TestIndexConnectorFactory : public connector::common::ConnectorFactory { public: TestIndexConnectorFactory() - : ConnectorFactory(kTestIndexConnectorName.c_str()) {} + : connector::common::ConnectorFactory(kTestIndexConnectorName.c_str()) {} - std::shared_ptr newConnector( + std::shared_ptr newConnector( const std::string& id, std::shared_ptr properties, folly::Executor* /*unused*/, diff --git a/velox/experimental/wave/dwio/ColumnReader.h b/velox/experimental/wave/dwio/ColumnReader.h index 6d6f3f3660fb..c8d527257076 100644 --- a/velox/experimental/wave/dwio/ColumnReader.h +++ b/velox/experimental/wave/dwio/ColumnReader.h @@ -46,7 +46,7 @@ class ColumnReader { virtual ~ColumnReader() = default; - const common::ScanSpec& scanSpec() const { + const velox::common::ScanSpec& scanSpec() const { return *scanSpec_; } diff --git a/velox/experimental/wave/dwio/FormatData.cpp b/velox/experimental/wave/dwio/FormatData.cpp index 2e849c204013..2eb94c043593 100644 --- a/velox/experimental/wave/dwio/FormatData.cpp +++ b/velox/experimental/wave/dwio/FormatData.cpp @@ -172,7 +172,7 @@ void setFilter(GpuDecode* step, ColumnReader* reader, Stream* stream) { return; } switch (veloxFilter->kind()) { - case common::FilterKind::kBigintRange: { + case velox::common::FilterKind::kBigintRange: { step->filterKind = WaveFilterKind::kBigintRange; step->nullsAllowed = veloxFilter->testNull(); step->filter._.int64Range[0] = diff --git a/velox/experimental/wave/dwio/FormatData.h b/velox/experimental/wave/dwio/FormatData.h index 8800f596ff02..923d8bc7cf53 100644 --- a/velox/experimental/wave/dwio/FormatData.h +++ b/velox/experimental/wave/dwio/FormatData.h @@ -38,7 +38,7 @@ using StagingSet = OperandSet; // Describes how a column is staged on GPU, for example, copy from host RAM, // direct read, already on device etc. struct Staging { - Staging(const void* hostData, int32_t size, const common::Region& region) + Staging(const void* hostData, int32_t size, const velox::common::Region& region) : hostData(hostData), size(hostData ? size : region.length), fileOffset(region.offset) {} diff --git a/velox/experimental/wave/exec/TableScan.cpp b/velox/experimental/wave/exec/TableScan.cpp index 6a988bf02cf4..be2fe49dc7ba 100644 --- a/velox/experimental/wave/exec/TableScan.cpp +++ b/velox/experimental/wave/exec/TableScan.cpp @@ -157,13 +157,13 @@ BlockingReason TableScan::nextSplit(ContinueFuture* future) { return BlockingReason::kNotBlocked; } -void TableScan::preload(std::shared_ptr split) { +void TableScan::preload(std::shared_ptr split) { // The AsyncSource returns a unique_ptr to the shared_ptr of the // DataSource. The callback may outlive the Task, hence it captures // a shared_ptr to it. This is required to keep memory pools live // for the duration. The callback checks for task cancellation to // avoid needless work. 
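A minimal sketch of the lifetime rule spelled out in the comment above: the preparation callback value-captures a shared_ptr so it stays valid even if it outlives its creator, and it re-checks cancellation before doing any work. FakeTask, FakeDataSource, and makePreloadCallback are hypothetical stand-ins, not the Velox types used in this hunk.

#include <atomic>
#include <functional>
#include <memory>

struct FakeTask {
  std::atomic<bool> cancelled{false};
  bool isCancelled() const {
    return cancelled.load();
  }
};

struct FakeDataSource {};

// Returns a callback suitable for running on a background executor. The
// captured shared_ptr keeps FakeTask (and anything it owns, such as memory
// pools) alive for the duration of the callback.
std::function<std::unique_ptr<FakeDataSource>()> makePreloadCallback(
    std::shared_ptr<FakeTask> task) {
  return [task]() -> std::unique_ptr<FakeDataSource> {
    if (task->isCancelled()) {
      return nullptr; // Skip needless work once the task is gone.
    }
    return std::make_unique<FakeDataSource>();
  };
}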
- split->dataSource = std::make_unique>( + split->dataSource = std::make_unique>( [type = outputType_, source = waveDataSource_, table = tableHandle_, @@ -174,7 +174,7 @@ void TableScan::preload(std::shared_ptr split) { ctx = driver_->operatorCtx()->createConnectorQueryCtx( split->connectorId, planNodeId_, connectorPool_), task = driver_->operatorCtx()->task(), - split]() -> std::unique_ptr { + split]() -> std::unique_ptr { if (task->isCancelled()) { return nullptr; } @@ -209,7 +209,7 @@ void TableScan::checkPreload() { maxSplitPreloadPerDriver_; if (!splitPreloader_) { splitPreloader_ = - [executor, this](std::shared_ptr split) { + [executor, this](std::shared_ptr split) { preload(split); executor->add( @@ -229,7 +229,7 @@ bool TableScan::isFinished() const { void TableScan::addDynamicFilter( const core::PlanNodeId& producer, column_index_t outputChannel, - const std::shared_ptr& filter) { + const std::shared_ptr& filter) { if (dataSource_) { dataSource_->addDynamicFilter(outputChannel, filter); } else { diff --git a/velox/experimental/wave/exec/TableScan.h b/velox/experimental/wave/exec/TableScan.h index dc935bcae71f..5740b2f4bda0 100644 --- a/velox/experimental/wave/exec/TableScan.h +++ b/velox/experimental/wave/exec/TableScan.h @@ -49,7 +49,7 @@ class TableScan : public WaveSourceOperator { ->queryConfig() .preferredOutputBatchRows()) { defines_ = std::move(defines); - connector_ = connector::getConnector(tableHandle_->connectorId()); + connector_ = connector::common::getConnector(tableHandle_->connectorId()); } std::vector canAdvance(WaveStream& stream) override; @@ -72,7 +72,7 @@ class TableScan : public WaveSourceOperator { void addDynamicFilter( const core::PlanNodeId& producer, column_index_t outputChannel, - const std::shared_ptr& filter) override; + const std::shared_ptr& filter) override; static uint64_t ioWaitNanos() { return ioWaitNanos_; @@ -95,7 +95,7 @@ class TableScan : public WaveSourceOperator { // DataSource to read 'split'. This source will be prepared in the // background on the executor of the connector. If the DataSource is // needed before prepare is done, it will be made when needed. - void preload(std::shared_ptr split); + void preload(std::shared_ptr split); // Adds 'stats' to operator stats of the containing WaveDriver. Some // stats come from DataSource, others from SplitReader. If @@ -109,24 +109,24 @@ class TableScan : public WaveSourceOperator { // Process-wide IO wait time. static std::atomic ioWaitNanos_; - const std::shared_ptr tableHandle_; + const std::shared_ptr tableHandle_; const std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> columnHandles_; exec::DriverCtx* const driverCtx_; memory::MemoryPool* const connectorPool_; ContinueFuture blockingFuture_{ContinueFuture::makeEmpty()}; exec::BlockingReason blockingReason_; bool needNewSplit_ = true; - std::shared_ptr connector_; - std::shared_ptr connectorQueryCtx_; + std::shared_ptr connector_; + std::shared_ptr connectorQueryCtx_; bool noMoreSplits_ = false; // Dynamic filters to add to the data source when it gets created. - std::unordered_map> + std::unordered_map> pendingDynamicFilters_; - std::shared_ptr dataSource_; + std::shared_ptr dataSource_; std::shared_ptr waveDataSource_; @@ -139,7 +139,7 @@ class TableScan : public WaveSourceOperator { // callback can schedule preloads on an executor. These preloads may // outlive the Task and therefore need to capture a shared_ptr to // it. 
- std::function&)> + std::function&)> splitPreloader_{nullptr}; // Count of splits that started background preload. diff --git a/velox/experimental/wave/exec/ToWave.cpp b/velox/experimental/wave/exec/ToWave.cpp index acbcf51613d1..066dcf0846bc 100644 --- a/velox/experimental/wave/exec/ToWave.cpp +++ b/velox/experimental/wave/exec/ToWave.cpp @@ -60,7 +60,7 @@ common::Subfield* CompileState::toSubfield(const std::string& name) { VELOX_CHECK(!namesResolved_); auto it = subfields_.find(name); if (it == subfields_.end()) { - auto field = std::make_unique(name); + auto field = std::make_unique(name); auto result = field.get(); subfields_[name] = std::move(field); return result; diff --git a/velox/experimental/wave/exec/ToWave.h b/velox/experimental/wave/exec/ToWave.h index d17820634e58..8f430a2e72a9 100644 --- a/velox/experimental/wave/exec/ToWave.h +++ b/velox/experimental/wave/exec/ToWave.h @@ -26,7 +26,7 @@ namespace facebook::velox::wave { using SubfieldMap = - folly::F14FastMap>; + folly::F14FastMap>; /// Branch targets when generating device code. struct Branches { @@ -835,7 +835,7 @@ struct Segment { // If this projects out columns, these are the column names, 1:1 to // topLevelDefined. - std::vector projectedName; + std::vector projectedName; // intermediates that are unconditionally computed and could be referenced // from subsequent places for optimization, e.g. dedupping. Does not include @@ -867,9 +867,9 @@ class CompileState { // Wave equivalents. Returns true if the Driver was changed. bool compile(); - common::Subfield* toSubfield(const exec::Expr& expr); + velox::common::Subfield* toSubfield(const exec::Expr& expr); - common::Subfield* toSubfield(const std::string& name); + velox::common::Subfield* toSubfield(const std::string& name); AbstractOperand* newOperand(AbstractOperand& other); @@ -968,7 +968,7 @@ class CompileState { return &topScope_; } - AbstractOperand* fieldToOperand(common::Subfield& field, Scope* scope); + AbstractOperand* fieldToOperand(velox::common::Subfield& field, Scope* scope); FunctionMetadata functionReferenced(const AbstractOperand* op); diff --git a/velox/experimental/wave/exec/Wave.cpp b/velox/experimental/wave/exec/Wave.cpp index 5c2a6d0b72e9..54fb4da43ec0 100644 --- a/velox/experimental/wave/exec/Wave.cpp +++ b/velox/experimental/wave/exec/Wave.cpp @@ -118,11 +118,11 @@ void OperatorStateMap::addIfNew( AbstractOperand* pathToOperand( const DefinesMap& map, - std::vector>& path) { + std::vector>& path) { if (path.empty()) { return nullptr; } - common::Subfield field(std::move(path)); + velox::common::Subfield field(std::move(path)); const auto subfieldMap = threadSubfieldMap(); auto it = threadSubfieldMap()->find(field.toString()); if (it == subfieldMap->end()) { diff --git a/velox/experimental/wave/exec/Wave.h b/velox/experimental/wave/exec/Wave.h index ab0a8ae332e7..9a61e2efc023 100644 --- a/velox/experimental/wave/exec/Wave.h +++ b/velox/experimental/wave/exec/Wave.h @@ -160,7 +160,7 @@ struct Value { Value() = default; Value(const exec::Expr* expr) : expr(expr), subfield(nullptr) {} - Value(const common::Subfield* subfield) : expr(nullptr), subfield(subfield) {} + Value(const velox::common::Subfield* subfield) : expr(nullptr), subfield(subfield) {} ~Value() = default; bool operator==(const Value& other) const { @@ -171,7 +171,7 @@ struct Value { std::string toString() const; const exec::Expr* expr; - const common::Subfield* subfield; + const velox::common::Subfield* subfield; }; struct ValueHasher { @@ -190,7 +190,7 @@ struct ValueComparer { }; 
using SubfieldMap = - folly::F14FastMap>; + folly::F14FastMap>; using DefinesMap = folly::F14FastMap; @@ -200,7 +200,7 @@ using DefinesMap = /// moved into a Subfield. Not thread safe for 'path'. AbstractOperand* pathToOperand( const DefinesMap& map, - std::vector>& path); + std::vector>& path); const SubfieldMap*& threadSubfieldMap(); diff --git a/velox/experimental/wave/exec/WaveDataSource.h b/velox/experimental/wave/exec/WaveDataSource.h index caf7c28f2504..69dd3dd94dac 100644 --- a/velox/experimental/wave/exec/WaveDataSource.h +++ b/velox/experimental/wave/exec/WaveDataSource.h @@ -16,8 +16,8 @@ #pragma once +#include "../../../connectors/common/Connector.h" #include "velox/common/time/Timer.h" -#include "velox/connectors/Connector.h" #include "velox/exec/Task.h" #include "velox/experimental/wave/exec/WaveOperator.h" #include "velox/expression/Expr.h" @@ -26,7 +26,7 @@ namespace facebook::velox::wave { class WaveSplitReader; -/// A delegate produced by a regular Velox connector::DataSource for reading its +/// A delegate produced by a regular Velox connector::common::DataSource for reading its /// particular file format on GPU. Same methods, except that Wave schedule() and /// related are exposed instead of an iterator model. class WaveDataSource : public std::enable_shared_from_this { @@ -40,9 +40,9 @@ class WaveDataSource : public std::enable_shared_from_this { virtual void addDynamicFilter( column_index_t outputChannel, - const std::shared_ptr& filter) = 0; + const std::shared_ptr& filter) = 0; - virtual void addSplit(std::shared_ptr split) = 0; + virtual void addSplit(std::shared_ptr split) = 0; virtual int32_t canAdvance(WaveStream& stream) = 0; diff --git a/velox/experimental/wave/exec/WaveDriver.cpp b/velox/experimental/wave/exec/WaveDriver.cpp index c6f97ecb6899..a3e9d01b1536 100644 --- a/velox/experimental/wave/exec/WaveDriver.cpp +++ b/velox/experimental/wave/exec/WaveDriver.cpp @@ -628,7 +628,7 @@ Advance WaveDriver::advance(int pipelineIdx) { blockingReason_ = exec::BlockingReason::kYield; blockingFuture_ = ContinueFuture{folly::Unit{}}; // A point for test code injection. 
- common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::wave::WaveDriver::getOutput::yield", this); totalWaitLoops += waitLoops; waveStats_.waitTime.micros += waitUs; diff --git a/velox/experimental/wave/exec/WaveDriver.h b/velox/experimental/wave/exec/WaveDriver.h index a0072a64bbe7..be98a8a2be5c 100644 --- a/velox/experimental/wave/exec/WaveDriver.h +++ b/velox/experimental/wave/exec/WaveDriver.h @@ -213,7 +213,7 @@ class WaveDriver : public exec::SourceOperator { void addDynamicFilter( const core::PlanNodeId& producer, column_index_t outputChannel, - const std::shared_ptr& filter) override { + const std::shared_ptr& filter) override { pipelines_[0].operators[0]->addDynamicFilter( producer, outputChannel, filter); } diff --git a/velox/experimental/wave/exec/WaveHiveDataSource.cpp b/velox/experimental/wave/exec/WaveHiveDataSource.cpp index 1b372008303c..7bb281922783 100644 --- a/velox/experimental/wave/exec/WaveHiveDataSource.cpp +++ b/velox/experimental/wave/exec/WaveHiveDataSource.cpp @@ -22,17 +22,17 @@ using namespace connector::hive; WaveHiveDataSource::WaveHiveDataSource( const std::shared_ptr& hiveTableHandle, - const std::shared_ptr& scanSpec, + const std::shared_ptr& scanSpec, const RowTypePtr& readerOutputType, std::unordered_map>* partitionKeys, FileHandleFactory* fileHandleFactory, folly::Executor* executor, - const connector::ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveConfig, const std::shared_ptr& ioStats, const exec::ExprSet* remainingFilter, - std::shared_ptr metadataFilter) { + std::shared_ptr metadataFilter) { params_.hiveTableHandle = hiveTableHandle; params_.scanSpec = scanSpec; params_.readerOutputType = readerOutputType; @@ -48,7 +48,7 @@ WaveHiveDataSource::WaveHiveDataSource( void WaveHiveDataSource::addDynamicFilter( column_index_t outputChannel, - const std::shared_ptr& filter) { + const std::shared_ptr& filter) { VELOX_NYI(); } @@ -74,7 +74,7 @@ void WaveHiveDataSource::setFromDataSource( } void WaveHiveDataSource::addSplit( - std::shared_ptr split) { + std::shared_ptr split) { VELOX_CHECK( split_ == nullptr, "Previous split has not been processed yet. Call next to process the split."); @@ -162,23 +162,23 @@ void WaveHiveDataSource::registerConnector() { // Create hive connector with config... 
auto hiveConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector("wavemock", config, nullptr); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); connector::hive::HiveDataSource::registerWaveDelegateHook( [](const std::shared_ptr& hiveTableHandle, - const std::shared_ptr& scanSpec, + const std::shared_ptr& scanSpec, const RowTypePtr& readerOutputType, std::unordered_map>* partitionKeys, FileHandleFactory* fileHandleFactory, folly::Executor* executor, - const connector::ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveConfig, const std::shared_ptr& ioStats, const exec::ExprSet* remainingFilter, - std::shared_ptr metadataFilter) { + std::shared_ptr metadataFilter) { return std::make_shared( hiveTableHandle, scanSpec, diff --git a/velox/experimental/wave/exec/WaveHiveDataSource.h b/velox/experimental/wave/exec/WaveHiveDataSource.h index 7e1336ab4f57..2d3d8a6e6007 100644 --- a/velox/experimental/wave/exec/WaveHiveDataSource.h +++ b/velox/experimental/wave/exec/WaveHiveDataSource.h @@ -25,24 +25,24 @@ class WaveHiveDataSource : public WaveDataSource { public: WaveHiveDataSource( const std::shared_ptr& hiveTableHandle, - const std::shared_ptr& scanSpec, + const std::shared_ptr& scanSpec, const RowTypePtr& readerOutputType, std::unordered_map< std::string, std::shared_ptr>* partitionKeys, FileHandleFactory* fileHandleFactory, folly::Executor* executor, - const connector::ConnectorQueryCtx* connectorQueryCtx, + const connector::common::ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveConfig, const std::shared_ptr& ioStats, const exec::ExprSet* remainingFilter, - std::shared_ptr metadataFilter); + std::shared_ptr metadataFilter); void addDynamicFilter( column_index_t outputChannel, - const std::shared_ptr& filter) override; + const std::shared_ptr& filter) override; - void addSplit(std::shared_ptr split) override; + void addSplit(std::shared_ptr split) override; void setFromDataSource(std::shared_ptr dataSource) override; @@ -66,11 +66,11 @@ class WaveHiveDataSource : public WaveDataSource { private: SplitReaderParams params_; - std::shared_ptr split_; + std::shared_ptr split_; std::shared_ptr splitReader_; std::shared_ptr remainingFilter_; dwio::common::RuntimeStatistics runtimeStats_; - std::shared_ptr metadataFilter_; + std::shared_ptr metadataFilter_; int64_t completedRows_{0}; int64_t completedBytes_{0}; std::unordered_map splitReaderStats_; diff --git a/velox/experimental/wave/exec/WaveOperator.h b/velox/experimental/wave/exec/WaveOperator.h index f2ec2305cb30..bea1e8ed3413 100644 --- a/velox/experimental/wave/exec/WaveOperator.h +++ b/velox/experimental/wave/exec/WaveOperator.h @@ -135,7 +135,7 @@ class WaveOperator { } void addSubfieldAndType( - const common::Subfield* subfield, + const velox::common::Subfield* subfield, const TypePtr& type) { VELOX_UNSUPPORTED(); // subfields_.push_back(subfield); @@ -176,7 +176,7 @@ class WaveOperator { virtual void addDynamicFilter( const core::PlanNodeId& /*producer*/, column_index_t /*outputChannel*/, - const std::shared_ptr& /*filter*/) { + const std::shared_ptr& /*filter*/) { VELOX_UNSUPPORTED(); } @@ -193,7 +193,7 @@ class WaveOperator { // different times on different waves. In this list, ordered in // depth first preorder of outputType_. 
Top struct not listed, // struct columns have the parent before the children. - // std::vector subfields_; + // std::vector subfields_; // Pairwise type for each subfield. // std::vector types_; diff --git a/velox/experimental/wave/exec/WavePlan.cpp b/velox/experimental/wave/exec/WavePlan.cpp index ff53baa46bc5..292617f0f6db 100644 --- a/velox/experimental/wave/exec/WavePlan.cpp +++ b/velox/experimental/wave/exec/WavePlan.cpp @@ -31,7 +31,7 @@ DEFINE_int32(st_cost, 40, "Cost of store to memory"); namespace facebook::velox::wave { -using common::Subfield; +using velox::common::Subfield; using exec::Expr; std::string CodePosition::toString() const { @@ -135,7 +135,7 @@ AbstractOperand* CompileState::fieldToOperand(Subfield& field, Scope* scope) { return markUse(op); } auto* name = - &reinterpret_cast(field.path()[0].get()) + &reinterpret_cast(field.path()[0].get()) ->name(); VELOX_CHECK_EQ(topScopes_.size(), renames_.size()); for (int32_t i = renames_.size() - 1; i >= 0; --i) { diff --git a/velox/experimental/wave/exec/WaveSplitReader.cpp b/velox/experimental/wave/exec/WaveSplitReader.cpp index 330027b49fb0..bbd8624060f1 100644 --- a/velox/experimental/wave/exec/WaveSplitReader.cpp +++ b/velox/experimental/wave/exec/WaveSplitReader.cpp @@ -18,7 +18,7 @@ namespace facebook::velox::wave { std::shared_ptr WaveSplitReader::create( - const std::shared_ptr& split, + const std::shared_ptr& split, const SplitReaderParams& params, const DefinesMap* defines) { for (auto& factory : factories_) { diff --git a/velox/experimental/wave/exec/WaveSplitReader.h b/velox/experimental/wave/exec/WaveSplitReader.h index 3bd4e6a7cc54..7b94a2ebc727 100644 --- a/velox/experimental/wave/exec/WaveSplitReader.h +++ b/velox/experimental/wave/exec/WaveSplitReader.h @@ -31,14 +31,14 @@ namespace facebook::velox::wave { /// Parameters for a Wave Hive SplitReaderFactory. struct SplitReaderParams { std ::shared_ptr hiveTableHandle; - std::shared_ptr scanSpec; + std::shared_ptr scanSpec; RowTypePtr readerOutputType; std::unordered_map< std::string, std::shared_ptr>* partitionKeys; FileHandleFactory* fileHandleFactory; folly::Executor* executor; - const connector::ConnectorQueryCtx* connectorQueryCtx; + const connector::common::ConnectorQueryCtx* connectorQueryCtx; std::shared_ptr hiveConfig; std::shared_ptr ioStats; }; @@ -50,7 +50,7 @@ class WaveSplitReader { virtual ~WaveSplitReader() = default; static std::shared_ptr create( - const std::shared_ptr& split, + const std::shared_ptr& split, const SplitReaderParams& params, const DefinesMap* defines); @@ -70,7 +70,7 @@ class WaveSplitReader { virtual void configureReaderOptions() {} virtual void prepareSplit( - std::shared_ptr metadataFilter, + std::shared_ptr metadataFilter, dwio::common::RuntimeStatistics& runtimeStats) {} static void registerFactory(std::unique_ptr factory); @@ -85,7 +85,7 @@ class WaveSplitReaderFactory { /// Returns a new split reader corresponding to 'split' if 'this' recognizes /// the split, otherwise returns nullptr. 
virtual std::shared_ptr create( - const std::shared_ptr& split, + const std::shared_ptr& split, const SplitReaderParams& params, const DefinesMap* defines) = 0; }; diff --git a/velox/experimental/wave/exec/tests/WaveBenchmark.cpp b/velox/experimental/wave/exec/tests/WaveBenchmark.cpp index d4ac1f136eeb..ae459c4b8d47 100644 --- a/velox/experimental/wave/exec/tests/WaveBenchmark.cpp +++ b/velox/experimental/wave/exec/tests/WaveBenchmark.cpp @@ -200,7 +200,7 @@ class WaveBenchmark : public QueryBenchmarkBase { rootPool_->addAggregateChild("HiveConnectorTestBase.Writer"); if (FLAGS_data_format == "dwrf") { auto config = std::make_shared(); - config->set(dwrf::Config::COMPRESSION, common::CompressionKind_NONE); + config->set(dwrf::Config::COMPRESSION, velox::common::CompressionKind_NONE); config->set( dwrf::Config::STRIPE_SIZE, static_cast(FLAGS_rows_per_stripe * FLAGS_num_columns * 8)); @@ -225,7 +225,7 @@ class WaveBenchmark : public QueryBenchmarkBase { return std::make_unique( 1000000, 1000000000, [&]() { return (++flushCounter % 1 == 0); }); }; - options.compressionKind = common::CompressionKind_NONE; + options.compressionKind = velox::common::CompressionKind_NONE; auto writer = std::make_unique( std::move(sink), options, asRowType(schema)); for (auto& batch : vectors) { @@ -378,7 +378,7 @@ class WaveBenchmark : public QueryBenchmarkBase { } } - std::vector> listSplits( + std::vector> listSplits( const std::string& path, int32_t numSplitsPerFile, const TpchPlan& plan) override { diff --git a/velox/experimental/wave/exec/tests/utils/FileFormat.h b/velox/experimental/wave/exec/tests/utils/FileFormat.h index 58e837d6beec..e41f258637cd 100644 --- a/velox/experimental/wave/exec/tests/utils/FileFormat.h +++ b/velox/experimental/wave/exec/tests/utils/FileFormat.h @@ -15,8 +15,8 @@ */ #pragma once +#include "../../../../../connectors/common/Connector.h" #include "velox/common/file/Region.h" -#include "velox/connectors/Connector.h" #include "velox/connectors/hive/HiveConnectorSplit.h" #include "velox/dwio/common/TypeWithId.h" #include "velox/type/StringView.h" @@ -60,7 +60,7 @@ struct Column { std::unique_ptr nulls; /// Location of raw data in the backing file, or 0,0 if no backing file. 
- common::Region region; + velox::common::Region region; std::vector> children; }; @@ -218,7 +218,7 @@ class Writer { std::vector> encoders_; }; -using SplitVector = std::vector>; +using SplitVector = std::vector>; class Table { public: diff --git a/velox/experimental/wave/exec/tests/utils/TestFormatReader.cpp b/velox/experimental/wave/exec/tests/utils/TestFormatReader.cpp index e6032a27e413..cf6d32ffbcd4 100644 --- a/velox/experimental/wave/exec/tests/utils/TestFormatReader.cpp +++ b/velox/experimental/wave/exec/tests/utils/TestFormatReader.cpp @@ -21,7 +21,7 @@ DECLARE_int32(wave_reader_rows_per_tb); namespace facebook::velox::wave::test { -using common::Subfield; +using velox::common::Subfield; std::unique_ptr TestFormatParams::toFormatData( const std::shared_ptr& type, @@ -262,7 +262,7 @@ class TestStructColumnReader : public StructColumnReader { const TypePtr& requestedType, const std::shared_ptr& fileType, TestFormatParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, std::vector>& path, const DefinesMap& defines, bool isRoot) @@ -289,7 +289,7 @@ class TestStructColumnReader : public StructColumnReader { auto childParams = TestFormatParams( params.pool(), params.runtimeStatistics(), params.stripe()); - path.push_back(std::make_unique( + path.push_back(std::make_unique( childSpec->fieldName())); addChild(TestFormatReader::build( childRequestedType, @@ -308,7 +308,7 @@ std::unique_ptr buildIntegerReader( const TypePtr& requestedType, const std::shared_ptr& fileType, TestFormatParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, std::vector>& path, const DefinesMap& defines) { return std::make_unique( @@ -320,7 +320,7 @@ std::unique_ptr TestFormatReader::build( const TypePtr& requestedType, const std::shared_ptr& fileType, TestFormatParams& params, - common::ScanSpec& scanSpec, + velox::common::ScanSpec& scanSpec, std::vector>& path, const DefinesMap& defines, bool isRoot) { diff --git a/velox/experimental/wave/exec/tests/utils/TestFormatReader.h b/velox/experimental/wave/exec/tests/utils/TestFormatReader.h index f04e9df19780..a62c87f93565 100644 --- a/velox/experimental/wave/exec/tests/utils/TestFormatReader.h +++ b/velox/experimental/wave/exec/tests/utils/TestFormatReader.h @@ -126,8 +126,8 @@ class TestFormatReader { const TypePtr& requestedType, const std::shared_ptr& fileType, TestFormatParams& params, - common::ScanSpec& scanSpec, - std::vector>& path, + velox::common::ScanSpec& scanSpec, + std::vector>& path, const DefinesMap& defines, bool isRoot = false); }; diff --git a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp index bdbaab479d75..303b28c1dc19 100644 --- a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp +++ b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.cpp @@ -21,10 +21,10 @@ DECLARE_int32(wave_max_reader_batch_rows); namespace facebook::velox::wave::test { -using common::Subfield; +using velox::common::Subfield; WaveTestSplitReader::WaveTestSplitReader( - const std::shared_ptr& split, + const std::shared_ptr& split, const SplitReaderParams& params, const DefinesMap* defines) { params_ = params; @@ -113,7 +113,7 @@ namespace { class WaveTestSplitReaderFactory : public WaveSplitReaderFactory { public: std::shared_ptr create( - const std::shared_ptr& split, + const std::shared_ptr& split, const SplitReaderParams& params, const DefinesMap* defines) override { auto hiveSplit = diff --git 
a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h index 320c8a08784e..f8c6038e636a 100644 --- a/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h +++ b/velox/experimental/wave/exec/tests/utils/WaveTestSplitReader.h @@ -26,7 +26,7 @@ namespace facebook::velox::wave::test { class WaveTestSplitReader : public WaveSplitReader { public: WaveTestSplitReader( - const std::shared_ptr& split, + const std::shared_ptr& split, const SplitReaderParams& params, const DefinesMap* defines); @@ -61,7 +61,7 @@ class WaveTestSplitReader : public WaveSplitReader { return stripe_->columns[0]->numValues - nextRow_; } - std::shared_ptr split_; + std::shared_ptr split_; SplitReaderParams params_; FileHandleCachedPtr fileHandleCachePtr; cache::AsyncDataCache* cache_{nullptr}; diff --git a/velox/expression/Expr.cpp b/velox/expression/Expr.cpp index cdbaea9e24c2..a9f596726dd1 100644 --- a/velox/expression/Expr.cpp +++ b/velox/expression/Expr.cpp @@ -546,7 +546,7 @@ class ExprExceptionContext { // Persist vector to disk try { - auto dataPathOpt = common::generateTempFilePath(basePath, "vector"); + auto dataPathOpt = velox::common::generateTempFilePath(basePath, "vector"); if (!dataPathOpt.has_value()) { dataPath_ = "Failed to create file for saving input vector."; return; @@ -561,7 +561,7 @@ class ExprExceptionContext { // Persist sql to disk auto sql = expr_->toSql(); try { - auto sqlPathOpt = common::generateTempFilePath(basePath, "sql"); + auto sqlPathOpt = velox::common::generateTempFilePath(basePath, "sql"); if (!sqlPathOpt.has_value()) { sqlPath_ = "Failed to create file for saving SQL."; return; @@ -583,7 +583,7 @@ class ExprExceptionContext { allSql << exprs[i]->toSql(); } try { - auto sqlPathOpt = common::generateTempFilePath(basePath, "allExprSql"); + auto sqlPathOpt = velox::common::generateTempFilePath(basePath, "allExprSql"); if (!sqlPathOpt.has_value()) { allExprSqlPath_ = "Failed to create file for saving all SQL expressions."; @@ -1035,7 +1035,7 @@ Expr::PeelEncodingsResult Expr::peelEncodings( !peeledVectors[0]->memoDisabled(); } - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::exec::Expr::peelEncodings::mayCache", &mayCache); return {newRows, finalRowsHolder.get(), mayCache}; } @@ -1673,7 +1673,7 @@ namespace { common::Subfield extractSubfield( const Expr* expr, const folly::F14FastMap& shadowedNames) { - std::vector> path; + std::vector> path; for (;;) { if (auto* ref = expr->as()) { const auto& name = ref->name(); @@ -1683,7 +1683,7 @@ common::Subfield extractSubfield( expr = expr->inputs()[0].get(); continue; } - path.push_back(std::make_unique(name)); + path.push_back(std::make_unique(name)); if (!ref->inputs().empty()) { expr = ref->inputs()[0].get(); continue; @@ -1692,7 +1692,7 @@ common::Subfield extractSubfield( return {}; } std::reverse(path.begin(), path.end()); - return common::Subfield(std::move(path)); + return velox::common::Subfield(std::move(path)); } if (!expr->vectorFunction()) { return {}; @@ -1708,23 +1708,23 @@ common::Subfield extractSubfield( } switch (index->value()->typeKind()) { case TypeKind::TINYINT: - path.push_back(std::make_unique( + path.push_back(std::make_unique( index->value()->as>()->value())); break; case TypeKind::SMALLINT: - path.push_back(std::make_unique( + path.push_back(std::make_unique( index->value()->as>()->value())); break; case TypeKind::INTEGER: - path.push_back(std::make_unique( + 
path.push_back(std::make_unique( index->value()->as>()->value())); break; case TypeKind::BIGINT: - path.push_back(std::make_unique( + path.push_back(std::make_unique( index->value()->as>()->value())); break; case TypeKind::VARCHAR: - path.push_back(std::make_unique( + path.push_back(std::make_unique( index->value()->as>()->value())); break; default: @@ -1738,7 +1738,7 @@ common::Subfield extractSubfield( void Expr::extractSubfieldsImpl( folly::F14FastMap* shadowedNames, - std::vector* subfields) const { + std::vector* subfields) const { auto subfield = extractSubfield(this, *shadowedNames); if (subfield.valid()) { subfields->push_back(std::move(subfield)); @@ -1749,9 +1749,9 @@ void Expr::extractSubfieldsImpl( } } -std::vector Expr::extractSubfields() const { +std::vector Expr::extractSubfields() const { folly::F14FastMap shadowedNames; - std::vector subfields; + std::vector subfields; extractSubfieldsImpl(&shadowedNames, &subfields); return subfields; } @@ -1865,7 +1865,7 @@ void printInputAndExprs( } // Persist vector to disk try { - auto dataPathOpt = common::generateTempFilePath(basePath, "vector"); + auto dataPathOpt = velox::common::generateTempFilePath(basePath, "vector"); if (!dataPathOpt.has_value()) { return; } @@ -1883,7 +1883,7 @@ void printInputAndExprs( } allSql << exprs[i]->toSql(); } - auto sqlPathOpt = common::generateTempFilePath(basePath, "allExprSql"); + auto sqlPathOpt = velox::common::generateTempFilePath(basePath, "allExprSql"); if (!sqlPathOpt.has_value()) { return; } diff --git a/velox/expression/Expr.h b/velox/expression/Expr.h index 17ec0377ae60..f28cf9774d2e 100644 --- a/velox/expression/Expr.h +++ b/velox/expression/Expr.h @@ -318,11 +318,11 @@ class Expr { return false; } - std::vector extractSubfields() const; + std::vector extractSubfields() const; virtual void extractSubfieldsImpl( folly::F14FastMap* shadowedNames, - std::vector* subfields) const; + std::vector* subfields) const; template const T* as() const { diff --git a/velox/expression/ExprToSubfieldFilter.cpp b/velox/expression/ExprToSubfieldFilter.cpp index 54a11e065e43..b87ea5e4d402 100644 --- a/velox/expression/ExprToSubfieldFilter.cpp +++ b/velox/expression/ExprToSubfieldFilter.cpp @@ -54,12 +54,12 @@ const core::CallTypedExpr* asCall(const core::ITypedExpr* expr) { return dynamic_cast(expr); } -common::BigintRange* asBigintRange(std::unique_ptr& filter) { +common::BigintRange* asBigintRange(std::unique_ptr& filter) { return dynamic_cast(filter.get()); } common::BigintMultiRange* asBigintMultiRange( - std::unique_ptr& filter) { + std::unique_ptr& filter) { return dynamic_cast(filter.get()); } @@ -68,9 +68,9 @@ std::unique_ptr asUniquePtr(std::unique_ptr ptr) { return std::unique_ptr(static_cast(ptr.release())); } -std::unique_ptr makeOrFilter( - std::unique_ptr a, - std::unique_ptr b) { +std::unique_ptr makeOrFilter( + std::unique_ptr a, + std::unique_ptr b) { if (asBigintRange(a) && asBigintRange(b)) { return bigintOr( asUniquePtr(std::move(a)), @@ -122,13 +122,13 @@ std::function()> bool ExprToSubfieldFilterParser::toSubfield( const core::ITypedExpr* field, - common::Subfield& subfield) { - std::vector> path; + velox::common::Subfield& subfield) { + std::vector> path; for (auto* current = field;;) { if (auto* fieldAccess = dynamic_cast(current)) { path.push_back( - std::make_unique(fieldAccess->name())); + std::make_unique(fieldAccess->name())); } else if ( auto* dereference = dynamic_cast(current)) { @@ -138,7 +138,7 @@ bool ExprToSubfieldFilterParser::toSubfield( if (name.empty()) { return 
false; } - path.push_back(std::make_unique(name)); + path.push_back(std::make_unique(name)); } else if (dynamic_cast(current) == nullptr) { return false; } else { @@ -157,11 +157,11 @@ bool ExprToSubfieldFilterParser::toSubfield( } } std::reverse(path.begin(), path.end()); - subfield = common::Subfield(std::move(path)); + subfield = velox::common::Subfield(std::move(path)); return true; } -std::unique_ptr ExprToSubfieldFilterParser::makeNotEqualFilter( +std::unique_ptr ExprToSubfieldFilterParser::makeNotEqualFilter( const core::TypedExprPtr& valueExpr, core::ExpressionEvaluator* evaluator) { auto value = toConstant(valueExpr, evaluator); @@ -169,12 +169,12 @@ std::unique_ptr ExprToSubfieldFilterParser::makeNotEqualFilter( return nullptr; } - std::unique_ptr lessThanFilter = + std::unique_ptr lessThanFilter = makeLessThanFilter(valueExpr, evaluator); if (!lessThanFilter) { return nullptr; } - std::unique_ptr greaterThanFilter = + std::unique_ptr greaterThanFilter = makeGreaterThanFilter(valueExpr, evaluator); if (!greaterThanFilter) { return nullptr; @@ -201,14 +201,14 @@ std::unique_ptr ExprToSubfieldFilterParser::makeNotEqualFilter( } else if (value->typeKind() == TypeKind::HUGEINT) { VELOX_NYI(); } else { - std::vector> filters; + std::vector> filters; filters.emplace_back(std::move(lessThanFilter)); filters.emplace_back(std::move(greaterThanFilter)); - return std::make_unique(std::move(filters), false); + return std::make_unique(std::move(filters), false); } } -std::unique_ptr ExprToSubfieldFilterParser::makeEqualFilter( +std::unique_ptr ExprToSubfieldFilterParser::makeEqualFilter( const core::TypedExprPtr& valueExpr, core::ExpressionEvaluator* evaluator) { auto value = toConstant(valueExpr, evaluator); @@ -237,7 +237,7 @@ std::unique_ptr ExprToSubfieldFilterParser::makeEqualFilter( } } -std::unique_ptr +std::unique_ptr ExprToSubfieldFilterParser::makeGreaterThanFilter( const core::TypedExprPtr& lowerExpr, core::ExpressionEvaluator* evaluator) { @@ -269,7 +269,7 @@ ExprToSubfieldFilterParser::makeGreaterThanFilter( } } -std::unique_ptr ExprToSubfieldFilterParser::makeLessThanFilter( +std::unique_ptr ExprToSubfieldFilterParser::makeLessThanFilter( const core::TypedExprPtr& upperExpr, core::ExpressionEvaluator* evaluator) { auto upper = toConstant(upperExpr, evaluator); @@ -300,7 +300,7 @@ std::unique_ptr ExprToSubfieldFilterParser::makeLessThanFilter( } } -std::unique_ptr +std::unique_ptr ExprToSubfieldFilterParser::makeLessThanOrEqualFilter( const core::TypedExprPtr& upperExpr, core::ExpressionEvaluator* evaluator) { @@ -332,7 +332,7 @@ ExprToSubfieldFilterParser::makeLessThanOrEqualFilter( } } -std::unique_ptr +std::unique_ptr ExprToSubfieldFilterParser::makeGreaterThanOrEqualFilter( const core::TypedExprPtr& lowerExpr, core::ExpressionEvaluator* evaluator) { @@ -364,7 +364,7 @@ ExprToSubfieldFilterParser::makeGreaterThanOrEqualFilter( } } -std::unique_ptr ExprToSubfieldFilterParser::makeInFilter( +std::unique_ptr ExprToSubfieldFilterParser::makeInFilter( const core::TypedExprPtr& expr, core::ExpressionEvaluator* evaluator, bool negated) { @@ -413,7 +413,7 @@ std::unique_ptr ExprToSubfieldFilterParser::makeInFilter( } } -std::unique_ptr ExprToSubfieldFilterParser::makeBetweenFilter( +std::unique_ptr ExprToSubfieldFilterParser::makeBetweenFilter( const core::TypedExprPtr& lowerExpr, const core::TypedExprPtr& upperExpr, core::ExpressionEvaluator* evaluator, @@ -465,10 +465,10 @@ std::unique_ptr ExprToSubfieldFilterParser::makeBetweenFilter( } } -std::unique_ptr +std::unique_ptr 
PrestoExprToSubfieldFilterParser::leafCallToSubfieldFilter( const core::CallTypedExpr& call, - common::Subfield& subfield, + velox::common::Subfield& subfield, core::ExpressionEvaluator* evaluator, bool negated) { if (call.inputs().empty()) { @@ -528,7 +528,7 @@ PrestoExprToSubfieldFilterParser::leafCallToSubfieldFilter( return nullptr; } -std::pair> toSubfieldFilter( +std::pair> toSubfieldFilter( const core::TypedExprPtr& expr, core::ExpressionEvaluator* evaluator) { if (auto call = asCall(expr.get())) { @@ -540,8 +540,8 @@ std::pair> toSubfieldFilter( std::move(left.first), makeOrFilter(std::move(left.second), std::move(right.second))}; } - common::Subfield subfield; - std::unique_ptr filter; + velox::common::Subfield subfield; + std::unique_ptr filter; if (call->name() == "not") { if (auto* inner = asCall(call->inputs()[0].get())) { filter = diff --git a/velox/expression/ExprToSubfieldFilter.h b/velox/expression/ExprToSubfieldFilter.h index 12ea9b4c2637..4e23214363cd 100644 --- a/velox/expression/ExprToSubfieldFilter.h +++ b/velox/expression/ExprToSubfieldFilter.h @@ -208,10 +208,10 @@ inline std::unique_ptr bigintOr( std::move(filters), nullAllowed); } -inline std::unique_ptr equal( +inline std::unique_ptr equal( const std::string& value, bool nullAllowed = false) { - return std::make_unique( + return std::make_unique( std::vector{value}, nullAllowed); } @@ -281,22 +281,22 @@ inline std::unique_ptr greaterThan( min, false, true, "", true, false, nullAllowed); } -inline std::unique_ptr in( +inline std::unique_ptr in( const std::vector& values, bool nullAllowed = false) { - return common::createBigintValues(values, nullAllowed); + return velox::common::createBigintValues(values, nullAllowed); } -inline std::unique_ptr notIn( +inline std::unique_ptr notIn( const std::vector& values, bool nullAllowed = false) { - return common::createNegatedBigintValues(values, nullAllowed); + return velox::common::createNegatedBigintValues(values, nullAllowed); } -inline std::unique_ptr in( +inline std::unique_ptr in( const std::vector& values, bool nullAllowed = false) { - return std::make_unique(values, nullAllowed); + return std::make_unique(values, nullAllowed); } inline std::unique_ptr notIn( @@ -320,12 +320,12 @@ inline std::unique_ptr isNotNull() { } template -std::unique_ptr +std::unique_ptr orFilter(std::unique_ptr a, std::unique_ptr b, bool nullAllowed = false) { - std::vector> filters; + std::vector> filters; filters.emplace_back(std::move(a)); filters.emplace_back(std::move(b)); - return std::make_unique(std::move(filters), nullAllowed); + return std::make_unique(std::move(filters), nullAllowed); } inline std::unique_ptr lessThanHugeint( @@ -367,7 +367,7 @@ betweenHugeint(int128_t min, int128_t max, bool nullAllowed = false) { return std::make_unique(min, max, nullAllowed); } -std::pair> toSubfieldFilter( +std::pair> toSubfieldFilter( const core::TypedExprPtr& expr, core::ExpressionEvaluator*); @@ -435,56 +435,56 @@ class ExprToSubfieldFilterParser { /// because this conversion is frequently applied when extracting filters from /// remaining filter in readers. Frequent throw clutters logs and slows down /// execution. - virtual std::unique_ptr leafCallToSubfieldFilter( + virtual std::unique_ptr leafCallToSubfieldFilter( const core::CallTypedExpr& call, - common::Subfield& subfield, + velox::common::Subfield& subfield, core::ExpressionEvaluator* evaluator, bool negated = false) = 0; protected: // Converts an expression into a subfield. 
Returns false if the expression is // not a valid field expression. - bool toSubfield(const core::ITypedExpr* field, common::Subfield& subfield); + bool toSubfield(const core::ITypedExpr* field, velox::common::Subfield& subfield); // Creates a non-equal subfield filter against the given constant. - std::unique_ptr makeNotEqualFilter( + std::unique_ptr makeNotEqualFilter( const core::TypedExprPtr& valueExpr, core::ExpressionEvaluator* evaluator); // Creates an equal subfield filter against the given constant. - std::unique_ptr makeEqualFilter( + std::unique_ptr makeEqualFilter( const core::TypedExprPtr& valueExpr, core::ExpressionEvaluator* evaluator); // Creates a greater-than subfield filter against the given constant. - std::unique_ptr makeGreaterThanFilter( + std::unique_ptr makeGreaterThanFilter( const core::TypedExprPtr& lowerExpr, core::ExpressionEvaluator* evaluator); // Creates a less-than subfield filter against the given constant. - std::unique_ptr makeLessThanFilter( + std::unique_ptr makeLessThanFilter( const core::TypedExprPtr& upperExpr, core::ExpressionEvaluator* evaluator); // Creates a less-than-or-equal subfield filter against the given constant. - std::unique_ptr makeLessThanOrEqualFilter( + std::unique_ptr makeLessThanOrEqualFilter( const core::TypedExprPtr& upperExpr, core::ExpressionEvaluator* evaluator); // Creates a greater-than-or-equal subfield filter against the given constant. - std::unique_ptr makeGreaterThanOrEqualFilter( + std::unique_ptr makeGreaterThanOrEqualFilter( const core::TypedExprPtr& lowerExpr, core::ExpressionEvaluator* evaluator); // Creates an in subfield filter against the given vector. - std::unique_ptr makeInFilter( + std::unique_ptr makeInFilter( const core::TypedExprPtr& expr, core::ExpressionEvaluator* evaluator, bool negated); // Creates a between subfield filter against the given lower and upper // bounds. - std::unique_ptr makeBetweenFilter( + std::unique_ptr makeBetweenFilter( const core::TypedExprPtr& lowerExpr, const core::TypedExprPtr& upperExpr, core::ExpressionEvaluator* evaluator, @@ -499,9 +499,9 @@ class ExprToSubfieldFilterParser { // Parser for Presto expressions. 
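A minimal sketch of how the helpers in this header compose after the velox::common qualification, using only functions visible in this hunk; the include path comes from the diff header, while the using-directives and exampleSubfieldFilter are assumptions for illustration.

#include <memory>
#include <utility>
#include <vector>

#include "velox/expression/ExprToSubfieldFilter.h"

using namespace facebook::velox;       // assumed; matches the velox::common spellings above
using namespace facebook::velox::exec; // assumed namespace of the filter helpers

// Pairs a parsed subfield path with a bigint IN-list filter, the same shape
// that toSubfieldFilter() returns.
std::pair<velox::common::Subfield, std::unique_ptr<velox::common::Filter>>
exampleSubfieldFilter() {
  velox::common::Subfield subfield("a.b");         // dotted field path
  auto filter = in(std::vector<int64_t>{1, 2, 3}); // velox::common::createBigintValues underneath
  return {std::move(subfield), std::move(filter)};
}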
class PrestoExprToSubfieldFilterParser : public ExprToSubfieldFilterParser { public: - std::unique_ptr leafCallToSubfieldFilter( + std::unique_ptr leafCallToSubfieldFilter( const core::CallTypedExpr& call, - common::Subfield& subfield, + velox::common::Subfield& subfield, core::ExpressionEvaluator* evaluator, bool negated = false) override; }; diff --git a/velox/expression/LambdaExpr.cpp b/velox/expression/LambdaExpr.cpp index 8e3f58f987e9..3f9f807f7730 100644 --- a/velox/expression/LambdaExpr.cpp +++ b/velox/expression/LambdaExpr.cpp @@ -288,7 +288,7 @@ void LambdaExpr::makeTypeWithCapture(EvalCtx& context) { void LambdaExpr::extractSubfieldsImpl( folly::F14FastMap* shadowedNames, - std::vector* subfields) const { + std::vector* subfields) const { for (auto& name : signature_->names()) { (*shadowedNames)[name]++; } diff --git a/velox/expression/LambdaExpr.h b/velox/expression/LambdaExpr.h index bba0dc0e85f5..91512398fb33 100644 --- a/velox/expression/LambdaExpr.h +++ b/velox/expression/LambdaExpr.h @@ -63,7 +63,7 @@ class LambdaExpr : public SpecialForm { void extractSubfieldsImpl( folly::F14FastMap* shadowedNames, - std::vector* subfields) const override; + std::vector* subfields) const override; RowTypePtr signature_; diff --git a/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp b/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp index acfb0200ab4b..d351370b4e8b 100644 --- a/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp +++ b/velox/expression/fuzzer/ExpressionFuzzerVerifier.cpp @@ -86,7 +86,7 @@ ExpressionFuzzerVerifier::ExpressionFuzzerVerifier( options_.expressionFuzzerOptions.referenceQueryRunner} { parse::registerTypeResolver(); filesystems::registerLocalFileSystem(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); exec::test::registerHiveConnector({}); dwrf::registerDwrfWriterFactory(); diff --git a/velox/expression/tests/ArrayWriterTest.cpp b/velox/expression/tests/ArrayWriterTest.cpp index f62f5fbdf5dd..973dd0601e06 100644 --- a/velox/expression/tests/ArrayWriterTest.cpp +++ b/velox/expression/tests/ArrayWriterTest.cpp @@ -539,7 +539,7 @@ TEST_F(ArrayWriterTest, copyFromNestedArray) { vectorWriter.finish(); using array_type = std::optional>>; - array_type array1 = common::testutil::optionalEmpty; + array_type array1 = velox::common::testutil::optionalEmpty; array_type array2 = {{1, 2, 3, 4}}; array_type array3 = {{1}}; diff --git a/velox/expression/tests/ExprToSubfieldFilterTest.cpp b/velox/expression/tests/ExprToSubfieldFilterTest.cpp index d720275c8213..7af46b513978 100644 --- a/velox/expression/tests/ExprToSubfieldFilterTest.cpp +++ b/velox/expression/tests/ExprToSubfieldFilterTest.cpp @@ -284,9 +284,9 @@ TEST_F(ExprToSubfieldFilterTest, dereferenceWithEmptyField) { class CustomExprToSubfieldFilterParser : public ExprToSubfieldFilterParser { public: - std::unique_ptr leafCallToSubfieldFilter( + std::unique_ptr leafCallToSubfieldFilter( const core::CallTypedExpr& call, - common::Subfield& subfield, + velox::common::Subfield& subfield, core::ExpressionEvaluator* evaluator, bool negated) override { if (call.inputs().empty()) { diff --git a/velox/expression/tests/ExpressionRunner.cpp b/velox/expression/tests/ExpressionRunner.cpp index ae9b572f919e..29cc3e65fc35 100644 --- a/velox/expression/tests/ExpressionRunner.cpp +++ b/velox/expression/tests/ExpressionRunner.cpp @@ -94,7 +94,7 @@ void saveResults( const std::string& directoryPath, const std::string& fileName) { auto path = - 
common::generateTempFilePath(directoryPath.c_str(), fileName.c_str()); + velox::common::generateTempFilePath(directoryPath.c_str(), fileName.c_str()); VELOX_CHECK( path.has_value(), "Failed to create file for saving result vector in {} directory.", diff --git a/velox/expression/tests/ExpressionRunnerTest.cpp b/velox/expression/tests/ExpressionRunnerTest.cpp index a0087f2d68a6..da5c0dc3d8b7 100644 --- a/velox/expression/tests/ExpressionRunnerTest.cpp +++ b/velox/expression/tests/ExpressionRunnerTest.cpp @@ -285,7 +285,7 @@ int main(int argc, char** argv) { memory::initializeMemoryManager(memory::MemoryManager::Options{}); filesystems::registerLocalFileSystem(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); exec::test::registerHiveConnector({}); dwrf::registerDwrfWriterFactory(); diff --git a/velox/expression/tests/ExpressionVerifier.cpp b/velox/expression/tests/ExpressionVerifier.cpp index 3f5270892986..29675eed71bf 100644 --- a/velox/expression/tests/ExpressionVerifier.cpp +++ b/velox/expression/tests/ExpressionVerifier.cpp @@ -635,7 +635,7 @@ void ExpressionVerifier::persistReproInfo( } // Create a new directory - auto dirPath = common::generateTempFolderPath(basePath, "expressionVerifier"); + auto dirPath = velox::common::generateTempFolderPath(basePath, "expressionVerifier"); if (!dirPath.has_value()) { LOG(INFO) << "Failed to create directory for persisting repro info."; return; diff --git a/velox/expression/tests/GenericViewTest.cpp b/velox/expression/tests/GenericViewTest.cpp index b6b0cc81c929..a652a0462881 100644 --- a/velox/expression/tests/GenericViewTest.cpp +++ b/velox/expression/tests/GenericViewTest.cpp @@ -39,7 +39,7 @@ class GenericViewTest : public functions::test::FunctionBaseTest { std::vector>>>; array_data_t arrayData1 = { - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{{{std::nullopt}}}}, {{std::nullopt, 1}}, {{std::nullopt, std::nullopt, std::nullopt}}, @@ -57,7 +57,7 @@ class GenericViewTest : public functions::test::FunctionBaseTest { }; array_data_t arrayData2 = { - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{{{std::nullopt}}}}, {{std::nullopt, 1}}, {{std::nullopt, std::nullopt, std::nullopt}}, diff --git a/velox/expression/tests/TryExprTest.cpp b/velox/expression/tests/TryExprTest.cpp index 2f51e7c22bc2..f0a1b9129bc2 100644 --- a/velox/expression/tests/TryExprTest.cpp +++ b/velox/expression/tests/TryExprTest.cpp @@ -26,7 +26,7 @@ namespace facebook::velox { -using namespace common::testutil; +using namespace velox::common::testutil; using namespace facebook::velox::test; class TryExprTest : public functions::test::FunctionBaseTest { diff --git a/velox/functions/lib/aggregates/DecimalAggregate.h b/velox/functions/lib/aggregates/DecimalAggregate.h index 58b534cda608..b2d7261e187e 100644 --- a/velox/functions/lib/aggregates/DecimalAggregate.h +++ b/velox/functions/lib/aggregates/DecimalAggregate.h @@ -34,7 +34,7 @@ struct LongDecimalWithOverflowState { void mergeWith(const StringView& serializedData) { VELOX_CHECK_EQ(serializedData.size(), serializedSize()); auto serialized = serializedData.data(); - common::InputByteStream stream(serialized); + velox::common::InputByteStream stream(serialized); count += stream.read(); overflow += stream.read(); uint64_t lowerSum = stream.read(); @@ -46,7 +46,7 @@ struct LongDecimalWithOverflowState { void serialize(StringView& serialized) { VELOX_CHECK_EQ(serialized.size(), serializedSize()); char* 
outputBuffer = const_cast(serialized.data()); - common::OutputByteStream outStream(outputBuffer); + velox::common::OutputByteStream outStream(outputBuffer); outStream.append((char*)&count, sizeof(int64_t)); outStream.append((char*)&overflow, sizeof(int64_t)); uint64_t lower = HugeInt::lower(sum); diff --git a/velox/functions/lib/aggregates/noisy_aggregation/NoisyCountAccumulator.h b/velox/functions/lib/aggregates/noisy_aggregation/NoisyCountAccumulator.h index a4c59b84af07..915a27b60504 100644 --- a/velox/functions/lib/aggregates/noisy_aggregation/NoisyCountAccumulator.h +++ b/velox/functions/lib/aggregates/noisy_aggregation/NoisyCountAccumulator.h @@ -46,13 +46,13 @@ struct NoisyCountAccumulator { } void serialize(char* output) { - common::OutputByteStream stream(output); + velox::common::OutputByteStream stream(output); stream.appendOne(count); stream.appendOne(noiseScale); } static NoisyCountAccumulator deserialize(const char* serialized) { - common::InputByteStream stream(serialized); + velox::common::InputByteStream stream(serialized); auto count = stream.read(); auto noiseScale = stream.read(); diff --git a/velox/functions/lib/aggregates/tests/utils/AggregationTestBase.cpp b/velox/functions/lib/aggregates/tests/utils/AggregationTestBase.cpp index de7114acc35e..08c280231fab 100644 --- a/velox/functions/lib/aggregates/tests/utils/AggregationTestBase.cpp +++ b/velox/functions/lib/aggregates/tests/utils/AggregationTestBase.cpp @@ -69,23 +69,23 @@ std::vector AggregationTestBase::makeVectors( void AggregationTestBase::SetUp() { OperatorTestBase::SetUp(); filesystems::registerLocalFileSystem(); - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); auto hiveConnector = - connector::getConnectorFactory( + connector::common::getConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName) ->newConnector( kHiveConnectorId, std::make_shared( std::unordered_map())); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); dwrf::registerDwrfReaderFactory(); } void AggregationTestBase::TearDown() { dwrf::unregisterDwrfReaderFactory(); - connector::unregisterConnector(kHiveConnectorId); - connector::unregisterConnectorFactory( + connector::common::unregisterConnector(kHiveConnectorId); + connector::common::unregisterConnectorFactory( connector::hive::HiveConnectorFactory::kHiveConnectorName); OperatorTestBase::TearDown(); } diff --git a/velox/functions/lib/tests/QuantileDigestTest.cpp b/velox/functions/lib/tests/QuantileDigestTest.cpp index d0e5f24bd2ac..3e82b7b15bb2 100644 --- a/velox/functions/lib/tests/QuantileDigestTest.cpp +++ b/velox/functions/lib/tests/QuantileDigestTest.cpp @@ -74,7 +74,7 @@ class QuantileDigestTest : public QuantileDigestTestBase { constexpr double kAccuracy = 0.8; std::vector values; QuantileDigest digest{StlAllocator(allocator()), kAccuracy}; - std::default_random_engine gen(common::testutil::getRandomSeed(42)); + std::default_random_engine gen(velox::common::testutil::getRandomSeed(42)); std::uniform_real_distribution dist{-10.0, 10.0}; for (int i = 0; i < N; ++i) { auto v = dist(gen); @@ -103,7 +103,7 @@ class QuantileDigestTest : public QuantileDigestTestBase { QuantileDigest digest1{allocator(), kAccuracy}; // QuantileDigest digest2{allocator(), kAccuracy}; - std::default_random_engine gen(common::testutil::getRandomSeed(42)); + std::default_random_engine gen(velox::common::testutil::getRandomSeed(42)); std::uniform_real_distribution<> dist; for (auto i = 
0; i < 100; ++i) { auto v = T(dist(gen)); @@ -142,7 +142,7 @@ class QuantileDigestTest : public QuantileDigestTestBase { QuantileDigest digestEmpty{StlAllocator(allocator()), kAccuracy}; std::vector values; - std::default_random_engine gen(common::testutil::getRandomSeed(42)); + std::default_random_engine gen(velox::common::testutil::getRandomSeed(42)); std::uniform_real_distribution<> dist; for (auto i = 0; i < 100; ++i) { auto v = T(dist(gen)); @@ -177,7 +177,7 @@ class QuantileDigestTest : public QuantileDigestTestBase { std::vector allValues; std::vector digest1Values; - std::default_random_engine gen(common::testutil::getRandomSeed(42)); + std::default_random_engine gen(velox::common::testutil::getRandomSeed(42)); std::uniform_real_distribution<> dist; for (auto i = 0; i < 10000; ++i) { auto v = T(dist(gen)); diff --git a/velox/functions/lib/tests/RepeatTest.cpp b/velox/functions/lib/tests/RepeatTest.cpp index e4c92a834ebf..bf2a77eeb6b7 100644 --- a/velox/functions/lib/tests/RepeatTest.cpp +++ b/velox/functions/lib/tests/RepeatTest.cpp @@ -68,7 +68,7 @@ TEST_F(RepeatTest, repeat) { {{0.0}}, {{-2.0, -2.0}}, {{3.333333, 3.333333, 3.333333}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{std::nullopt, std::nullopt, std::nullopt, std::nullopt}}, std::nullopt, }); diff --git a/velox/functions/lib/tests/TDigestTest.cpp b/velox/functions/lib/tests/TDigestTest.cpp index 440a3a17ab50..d6b187e1e2b7 100644 --- a/velox/functions/lib/tests/TDigestTest.cpp +++ b/velox/functions/lib/tests/TDigestTest.cpp @@ -53,7 +53,7 @@ TEST_F(TDigestTest, addElementsRandomized) { double values[N]; TDigest digest; std::vector positions; - std::default_random_engine gen(common::testutil::getRandomSeed(42)); + std::default_random_engine gen(velox::common::testutil::getRandomSeed(42)); std::uniform_real_distribution<> dist; for (int i = 0; i < N; ++i) { auto v = dist(gen); @@ -311,7 +311,7 @@ TEST_F(TDigestTest, normalDistribution) { constexpr int N = 1e5; std::vector positions; double values[N]; - std::default_random_engine gen(common::testutil::getRandomSeed(42)); + std::default_random_engine gen(velox::common::testutil::getRandomSeed(42)); for (double mean : {0, 1000}) { SCOPED_TRACE(fmt::format("mean={}", mean)); std::normal_distribution<> dist(mean, 1); @@ -342,7 +342,7 @@ TEST_F(TDigestTest, addWeighed) { TEST_F(TDigestTest, merge) { std::vector positions; - std::default_random_engine gen(common::testutil::getRandomSeed(42)); + std::default_random_engine gen(velox::common::testutil::getRandomSeed(42)); std::vector values; std::string buf; auto test = [&](int numDigests, int size, double mean, double stddev) { @@ -416,7 +416,7 @@ TEST_F(TDigestTest, largeScalePreservesWeights) { TDigest digest; std::vector positions; std::normal_distribution normal(1000, 100); - std::default_random_engine gen(common::testutil::getRandomSeed(42)); + std::default_random_engine gen(velox::common::testutil::getRandomSeed(42)); constexpr int N = 1e5; std::vector values; values.reserve(N); diff --git a/velox/functions/prestosql/BinaryFunctions.h b/velox/functions/prestosql/BinaryFunctions.h index a5e86e421449..dbb09c7022b2 100644 --- a/velox/functions/prestosql/BinaryFunctions.h +++ b/velox/functions/prestosql/BinaryFunctions.h @@ -491,7 +491,7 @@ struct Murmur3X64_128Function { FOLLY_ALWAYS_INLINE void call(out_type& result, const arg_type& input) { result.resize(16); - common::hll::Murmur3Hash128::hash( + velox::common::hll::Murmur3Hash128::hash( input.data(), input.size(), 0, result.data()); } }; 
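// Illustrative aside (not part of the patch): a minimal, self-contained sketch of the
// C++ name-lookup situation that the qualification changes in this series guard against.
// The struct names below are hypothetical stand-ins, not Velox code; only the namespace
// layout mirrors what the diff introduces (a sibling facebook::velox::connector::common
// next to the existing facebook::velox::common). Once that sibling exists, an unqualified
// "common::Filter" written inside facebook::velox::connector resolves to the nearer
// connector::common and no longer reaches velox::common::Filter, so the patch spells out
// the extra "velox::" qualifier to keep such references unambiguous.
namespace facebook::velox {
namespace common {
struct Filter {};                 // stand-in for velox::common::Filter
}
namespace connector {
namespace common {
struct ConnectorFactory {};       // hypothetical member of the new sibling namespace
}
// Inside facebook::velox::connector, plain 'common::' now names connector::common,
// so velox::common::Filter has to be written with the additional qualifier:
inline velox::common::Filter makeFilter() {
  return velox::common::Filter{};
}
} // namespace connector
} // namespace facebook::velox

int main() {
  auto f = facebook::velox::connector::makeFilter();
  (void)f;
  return 0;
}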
diff --git a/velox/functions/prestosql/HyperLogLogFunctions.h b/velox/functions/prestosql/HyperLogLogFunctions.h index 8dc85d1ef58f..8b85c3b0ed9c 100644 --- a/velox/functions/prestosql/HyperLogLogFunctions.h +++ b/velox/functions/prestosql/HyperLogLogFunctions.h @@ -30,8 +30,8 @@ struct CardinalityFunction { FOLLY_ALWAYS_INLINE bool call( int64_t& result, const arg_type& hll) { - using common::hll::DenseHll; - using common::hll::SparseHll; + using velox::common::hll::DenseHll; + using velox::common::hll::SparseHll; if (SparseHll::canDeserialize(hll.data())) { result = SparseHll::cardinality(hll.data()); @@ -48,7 +48,7 @@ struct EmptyApproxSetFunction { FOLLY_ALWAYS_INLINE bool call(out_type& result) { static const std::string kEmpty = - common::hll::SparseHll::serializeEmpty(12); + velox::common::hll::SparseHll::serializeEmpty(12); result.resize(kEmpty.size()); memcpy(result.data(), kEmpty.data(), kEmpty.size()); @@ -68,9 +68,9 @@ struct EmptyApproxSetWithMaxErrorFunction { VELOX_USER_CHECK_NOT_NULL( maxStandardError, "empty_approx_set function requires constant value for maxStandardError argument"); - common::hll::checkMaxStandardError(*maxStandardError); - serialized_ = common::hll::SparseHll::serializeEmpty( - common::hll::toIndexBitLength(*maxStandardError)); + velox::common::hll::checkMaxStandardError(*maxStandardError); + serialized_ = velox::common::hll::SparseHll::serializeEmpty( + velox::common::hll::toIndexBitLength(*maxStandardError)); } FOLLY_ALWAYS_INLINE bool call( diff --git a/velox/functions/prestosql/InPredicate.cpp b/velox/functions/prestosql/InPredicate.cpp index 5ae0c6334373..66219e3359cb 100644 --- a/velox/functions/prestosql/InPredicate.cpp +++ b/velox/functions/prestosql/InPredicate.cpp @@ -177,7 +177,7 @@ std::pair, bool> toValues( // no values or only null values. The boolean is true if the list is // non-empty and consists of nulls only. template -std::pair, bool> createBigintValuesFilter( +std::pair, bool> createBigintValuesFilter( const VectorPtr& valuesVector, vector_size_t offset, vector_size_t size) { @@ -199,13 +199,13 @@ std::pair, bool> createBigintValuesFilter( false}; } - return {common::createBigintValues(values, nullAllowed), false}; + return {velox::common::createBigintValues(values, nullAllowed), false}; } // For double, cast double to Int64 and reuse Int64 filters // For float, cast float to Int32 and promote to Int64 template -std::pair, bool> +std::pair, bool> createFloatingPointValuesFilter( const VectorPtr& valuesVector, vector_size_t offset, @@ -243,12 +243,12 @@ createFloatingPointValuesFilter( intValues[i] = reinterpret_cast(values[i]); } } - return {common::createBigintValues(intValues, nullAllowed), false}; + return {velox::common::createBigintValues(intValues, nullAllowed), false}; } // See createBigintValuesFilter. template -std::pair, bool> createHugeintValuesFilter( +std::pair, bool> createHugeintValuesFilter( const VectorPtr& valuesVector, vector_size_t offset, vector_size_t size) { @@ -274,7 +274,7 @@ std::pair, bool> createHugeintValuesFilter( } // See createBigintValuesFilter. -std::pair, bool> createBytesValuesFilter( +std::pair, bool> createBytesValuesFilter( const VectorPtr& valuesVector, vector_size_t offset, vector_size_t size) { @@ -297,14 +297,14 @@ std::pair, bool> createBytesValuesFilter( false}; } - return {std::make_unique(values, nullAllowed), false}; + return {std::make_unique(values, nullAllowed), false}; } /// x IN (2, null) returns null when x != 2 and true when x == 2. 
/// Null for x always produces null, regardless of 'IN' list. class InPredicate : public exec::VectorFunction { public: - explicit InPredicate(std::unique_ptr filter, bool alwaysNull) + explicit InPredicate(std::unique_ptr filter, bool alwaysNull) : filter_{std::move(filter)}, alwaysNull_(alwaysNull) {} static std::shared_ptr create( @@ -349,7 +349,7 @@ class InPredicate : public exec::VectorFunction { return VectorSetInPredicate::create(elements, offset, size); } - std::pair, bool> filter; + std::pair, bool> filter; switch (elementType->kind()) { case TypeKind::HUGEINT: @@ -588,7 +588,7 @@ class InPredicate : public exec::VectorFunction { } } - const std::unique_ptr filter_; + const std::unique_ptr filter_; const bool alwaysNull_; }; } // namespace diff --git a/velox/functions/prestosql/aggregates/ApproxDistinctAggregate.cpp b/velox/functions/prestosql/aggregates/ApproxDistinctAggregate.cpp index fcd7c808d90d..00aed149ce62 100644 --- a/velox/functions/prestosql/aggregates/ApproxDistinctAggregate.cpp +++ b/velox/functions/prestosql/aggregates/ApproxDistinctAggregate.cpp @@ -132,7 +132,7 @@ void registerApproxDistinctAggregates( false, withCompanionFunctions, overwrite, - common::hll::kDefaultApproxDistinctStandardError); + velox::common::hll::kDefaultApproxDistinctStandardError); // approx_set is companion function for approx_distinct. Don't register // companion functions for it. registerApproxDistinct( @@ -140,7 +140,7 @@ void registerApproxDistinctAggregates( true, false, overwrite, - common::hll::kDefaultApproxSetStandardError); + velox::common::hll::kDefaultApproxSetStandardError); } } // namespace facebook::velox::aggregate::prestosql diff --git a/velox/functions/prestosql/aggregates/ClassificationAggregation.cpp b/velox/functions/prestosql/aggregates/ClassificationAggregation.cpp index e6bd84710d25..9bf643c818da 100644 --- a/velox/functions/prestosql/aggregates/ClassificationAggregation.cpp +++ b/velox/functions/prestosql/aggregates/ClassificationAggregation.cpp @@ -115,7 +115,7 @@ class FixedDoubleHistogram { /// histogram. size_t serialize(char* output) const { VELOX_CHECK(output); - common::OutputByteStream stream(output); + velox::common::OutputByteStream stream(output); size_t bytesUsed = 0; stream.append( reinterpret_cast(&kSerializationVersionHeader), @@ -143,7 +143,7 @@ class FixedDoubleHistogram { /// Merges the current histogram with another histogram represented as a /// buffer. void mergeWith(const char* data, size_t expectedSize) { - auto input = common::InputByteStream(data); + auto input = velox::common::InputByteStream(data); deserialize(*this, input, expectedSize); } @@ -162,7 +162,7 @@ class FixedDoubleHistogram { /// Deserializes the histogram from a buffer. 
static void deserialize( FixedDoubleHistogram& histogram, - common::InputByteStream& in, + velox::common::InputByteStream& in, size_t expectedSize) { if (FOLLY_UNLIKELY(expectedSize < minDeserializedBufferSize())) { VELOX_USER_FAIL( diff --git a/velox/functions/prestosql/aggregates/HyperLogLogAggregate.h b/velox/functions/prestosql/aggregates/HyperLogLogAggregate.h index e3a697620b4a..87401b8bccbb 100644 --- a/velox/functions/prestosql/aggregates/HyperLogLogAggregate.h +++ b/velox/functions/prestosql/aggregates/HyperLogLogAggregate.h @@ -36,12 +36,12 @@ template inline uint64_t hashOne(T value) { if constexpr (HllAsFinalResult) { if constexpr (std::is_same_v) { - return common::hll::Murmur3Hash128::hash64ForLong(value, 0); + return velox::common::hll::Murmur3Hash128::hash64ForLong(value, 0); } else if constexpr (std::is_same_v) { - return common::hll::Murmur3Hash128::hash64ForLong( + return velox::common::hll::Murmur3Hash128::hash64ForLong( *reinterpret_cast(&value), 0); } - return common::hll::Murmur3Hash128::hash64(&value, sizeof(T), 0); + return velox::common::hll::Murmur3Hash128::hash64(&value, sizeof(T), 0); } else { return XXH64(&value, sizeof(T), 0); } @@ -65,7 +65,7 @@ inline uint64_t hashOne(StringView value) { template <> inline uint64_t hashOne(StringView value) { - return common::hll::Murmur3Hash128::hash64(value.data(), value.size(), 0); + return velox::common::hll::Murmur3Hash128::hash64(value.data(), value.size(), 0); } template @@ -473,11 +473,11 @@ class HyperLogLogAggregate : public exec::Aggregate { } void checkSetMaxStandardError(double error) { - common::hll::checkMaxStandardError(error); + velox::common::hll::checkMaxStandardError(error); if (maxStandardError_ < 0) { maxStandardError_ = error; - indexBitLength_ = common::hll::toIndexBitLength(error); + indexBitLength_ = velox::common::hll::toIndexBitLength(error); } else { VELOX_USER_CHECK_EQ( error, diff --git a/velox/functions/prestosql/aggregates/MergeAggregate.cpp b/velox/functions/prestosql/aggregates/MergeAggregate.cpp index 47aec428220d..61ea70e1ac63 100644 --- a/velox/functions/prestosql/aggregates/MergeAggregate.cpp +++ b/velox/functions/prestosql/aggregates/MergeAggregate.cpp @@ -90,7 +90,7 @@ void registerMergeAggregate( prefix + kMerge, false, overwrite, - common::hll::kDefaultApproxSetStandardError); + velox::common::hll::kDefaultApproxSetStandardError); } } // namespace facebook::velox::aggregate::prestosql diff --git a/velox/functions/prestosql/aggregates/tests/ApproxDistinctTest.cpp b/velox/functions/prestosql/aggregates/tests/ApproxDistinctTest.cpp index d18d81b55448..35d8b2d950a9 100644 --- a/velox/functions/prestosql/aggregates/tests/ApproxDistinctTest.cpp +++ b/velox/functions/prestosql/aggregates/tests/ApproxDistinctTest.cpp @@ -279,30 +279,30 @@ TEST_F(ApproxDistinctTest, globalAggVeryLowCardinalityIntegers) { TEST_F(ApproxDistinctTest, toIndexBitLength) { ASSERT_EQ( - common::hll::toIndexBitLength(common::hll::kHighestMaxStandardError), 4); + velox::common::hll::toIndexBitLength(velox::common::hll::kHighestMaxStandardError), 4); ASSERT_EQ( - common::hll::toIndexBitLength( - common::hll::kDefaultApproxDistinctStandardError), + velox::common::hll::toIndexBitLength( + velox::common::hll::kDefaultApproxDistinctStandardError), 11); ASSERT_EQ( - common::hll::toIndexBitLength( - common::hll::kDefaultApproxSetStandardError), + velox::common::hll::toIndexBitLength( + velox::common::hll::kDefaultApproxSetStandardError), 12); ASSERT_EQ( - common::hll::toIndexBitLength(common::hll::kLowestMaxStandardError), 
16); - - ASSERT_EQ(common::hll::toIndexBitLength(0.0325), 10); - ASSERT_EQ(common::hll::toIndexBitLength(0.0324), 11); - ASSERT_EQ(common::hll::toIndexBitLength(0.0230), 11); - ASSERT_EQ(common::hll::toIndexBitLength(0.0229), 12); - ASSERT_EQ(common::hll::toIndexBitLength(0.0163), 12); - ASSERT_EQ(common::hll::toIndexBitLength(0.0162), 13); - ASSERT_EQ(common::hll::toIndexBitLength(0.0115), 13); - ASSERT_EQ(common::hll::toIndexBitLength(0.0114), 14); - ASSERT_EQ(common::hll::toIndexBitLength(0.008125), 14); - ASSERT_EQ(common::hll::toIndexBitLength(0.008124), 15); - ASSERT_EQ(common::hll::toIndexBitLength(0.00575), 15); - ASSERT_EQ(common::hll::toIndexBitLength(0.00574), 16); + velox::common::hll::toIndexBitLength(velox::common::hll::kLowestMaxStandardError), 16); + + ASSERT_EQ(velox::common::hll::toIndexBitLength(0.0325), 10); + ASSERT_EQ(velox::common::hll::toIndexBitLength(0.0324), 11); + ASSERT_EQ(velox::common::hll::toIndexBitLength(0.0230), 11); + ASSERT_EQ(velox::common::hll::toIndexBitLength(0.0229), 12); + ASSERT_EQ(velox::common::hll::toIndexBitLength(0.0163), 12); + ASSERT_EQ(velox::common::hll::toIndexBitLength(0.0162), 13); + ASSERT_EQ(velox::common::hll::toIndexBitLength(0.0115), 13); + ASSERT_EQ(velox::common::hll::toIndexBitLength(0.0114), 14); + ASSERT_EQ(velox::common::hll::toIndexBitLength(0.008125), 14); + ASSERT_EQ(velox::common::hll::toIndexBitLength(0.008124), 15); + ASSERT_EQ(velox::common::hll::toIndexBitLength(0.00575), 15); + ASSERT_EQ(velox::common::hll::toIndexBitLength(0.00574), 16); } TEST_F(ApproxDistinctTest, globalAggIntegersWithError) { @@ -312,33 +312,33 @@ TEST_F(ApproxDistinctTest, globalAggIntegersWithError) { { auto values = makeFlatVector(size, [](auto row) { return row; }); - testGlobalAgg(values, common::hll::kLowestMaxStandardError, 1000); + testGlobalAgg(values, velox::common::hll::kLowestMaxStandardError, 1000); testGlobalAgg(values, 0.01, 1000); testGlobalAgg(values, 0.1, 951); testGlobalAgg(values, 0.2, 936); - testGlobalAgg(values, common::hll::kHighestMaxStandardError, 929); + testGlobalAgg(values, velox::common::hll::kHighestMaxStandardError, 929); values = makeFlatVector(50'000, folly::identity); - testGlobalAgg(values, common::hll::kLowestMaxStandardError, 50043); - testGlobalAgg(values, common::hll::kHighestMaxStandardError, 39069); + testGlobalAgg(values, velox::common::hll::kLowestMaxStandardError, 50043); + testGlobalAgg(values, velox::common::hll::kHighestMaxStandardError, 39069); } // Test approx_set with bigint. 
{ auto values = makeFlatVector(size, [](auto row) { return row; }); - testGlobalAgg(values, common::hll::kLowestMaxStandardError, 1000, true); + testGlobalAgg(values, velox::common::hll::kLowestMaxStandardError, 1000, true); testGlobalAgg(values, 0.01, 1000, true); testGlobalAgg(values, 0.1, 1080, true, 945); testGlobalAgg(values, 0.2, 1340, true, 1028); testGlobalAgg( - values, common::hll::kHighestMaxStandardError, 1814, true, 1034); + values, velox::common::hll::kHighestMaxStandardError, 1814, true, 1034); values = makeFlatVector(50'000, folly::identity); testGlobalAgg( - values, common::hll::kLowestMaxStandardError, 50060, true, 50284); + values, velox::common::hll::kLowestMaxStandardError, 50060, true, 50284); testGlobalAgg( - values, common::hll::kHighestMaxStandardError, 45437, true, 40037); + values, velox::common::hll::kHighestMaxStandardError, 45437, true, 40037); } } @@ -410,8 +410,8 @@ TEST_F(ApproxDistinctTest, hugeInt) { auto hugeIntValues = makeFlatVector(50000, [](auto row) { return row; }); testGlobalAgg(hugeIntValues, 49669); - testGlobalAgg(hugeIntValues, common::hll::kLowestMaxStandardError, 50110); - testGlobalAgg(hugeIntValues, common::hll::kHighestMaxStandardError, 41741); + testGlobalAgg(hugeIntValues, velox::common::hll::kLowestMaxStandardError, 50110); + testGlobalAgg(hugeIntValues, velox::common::hll::kHighestMaxStandardError, 41741); } TEST_F(ApproxDistinctTest, streaming) { diff --git a/velox/functions/prestosql/aggregates/tests/ChecksumAggregateTest.cpp b/velox/functions/prestosql/aggregates/tests/ChecksumAggregateTest.cpp index f1821a89408f..7bd3877674fc 100644 --- a/velox/functions/prestosql/aggregates/tests/ChecksumAggregateTest.cpp +++ b/velox/functions/prestosql/aggregates/tests/ChecksumAggregateTest.cpp @@ -283,7 +283,7 @@ TEST_F(ChecksumAggregateTest, arrays) { assertChecksum(arrayVector, "Nlzernkj88A="); arrayVector = makeNullableArrayVector( - {{{1, 2}}, std::nullopt, common::testutil::optionalEmpty}); + {{{1, 2}}, std::nullopt, velox::common::testutil::optionalEmpty}); assertChecksum(arrayVector, "Nlzernkj88A="); // Array of arrays. 
diff --git a/velox/functions/prestosql/aggregates/tests/ClassificationAggregationTest.cpp b/velox/functions/prestosql/aggregates/tests/ClassificationAggregationTest.cpp index 22c7f6dc59a5..f63ca4a653b9 100644 --- a/velox/functions/prestosql/aggregates/tests/ClassificationAggregationTest.cpp +++ b/velox/functions/prestosql/aggregates/tests/ClassificationAggregationTest.cpp @@ -119,7 +119,7 @@ TEST_F(ClassificationAggregationTest, basic) { expected = makeRowVector({makeNullableArrayVector( std::vector>>>{ - common::testutil::optionalEmpty})}); + velox::common::testutil::optionalEmpty})}); runTest("classification_fall_out(5, c0, c1)", input, expected); runTest("classification_precision(5, c0, c1)", input, expected); runTest("classification_recall(5, c0, c1)", input, expected); diff --git a/velox/functions/prestosql/aggregates/tests/MapUnionAggregationTest.cpp b/velox/functions/prestosql/aggregates/tests/MapUnionAggregationTest.cpp index 33b819fde4f3..064fa862d51c 100644 --- a/velox/functions/prestosql/aggregates/tests/MapUnionAggregationTest.cpp +++ b/velox/functions/prestosql/aggregates/tests/MapUnionAggregationTest.cpp @@ -256,7 +256,7 @@ TEST_F(MapUnionTest, nulls) { makeNullableMapVector({ {{{1, 10}, {2, 20}, {3, 33}, {4, 44}, {5, 55}}}, std::nullopt, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, }), }); diff --git a/velox/functions/prestosql/aggregates/tests/MapUnionSumTest.cpp b/velox/functions/prestosql/aggregates/tests/MapUnionSumTest.cpp index 0f90e9ac6962..585d62f5831f 100644 --- a/velox/functions/prestosql/aggregates/tests/MapUnionSumTest.cpp +++ b/velox/functions/prestosql/aggregates/tests/MapUnionSumTest.cpp @@ -31,7 +31,7 @@ class MapUnionSumTest : public AggregationTestBase {}; TEST_F(MapUnionSumTest, global) { auto data = makeRowVector({ makeNullableMapVector({ - common::testutil::optionalEmpty, // empty map + velox::common::testutil::optionalEmpty, // empty map std::nullopt, // null map {{{1, 10}, {2, 20}}}, {{{1, 11}, {3, 30}, {4, 40}}}, @@ -63,7 +63,7 @@ TEST_F(MapUnionSumTest, globalVarcharKey) { auto data = makeRowVector({ makeNullableMapVector({ - common::testutil::optionalEmpty, // empty map + velox::common::testutil::optionalEmpty, // empty map std::nullopt, // null map {{{keys[0], 10}, {keys[1], 20}}}, {{{keys[0], 11}, {keys[2], 30}, {keys[3], 40}}}, @@ -120,9 +120,9 @@ TEST_F(MapUnionSumTest, nullAndEmptyMaps) { auto emptyAndNullMaps = makeRowVector({ makeNullableMapVector({ std::nullopt, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, std::nullopt, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, }), }); diff --git a/velox/functions/prestosql/aggregates/tests/PrestoHasherTest.cpp b/velox/functions/prestosql/aggregates/tests/PrestoHasherTest.cpp index 2f38263e52de..e5578d40ac00 100644 --- a/velox/functions/prestosql/aggregates/tests/PrestoHasherTest.cpp +++ b/velox/functions/prestosql/aggregates/tests/PrestoHasherTest.cpp @@ -294,7 +294,7 @@ TEST_F(PrestoHasherTest, arrays) { {{10, 11}}, {{12, std::nullopt}}, std::nullopt, - common::testutil::optionalEmpty}); + velox::common::testutil::optionalEmpty}); assertHash( baseArrayVector, @@ -321,7 +321,7 @@ TEST_F(PrestoHasherTest, arrays) { {{std::nullopt}}, {{1, 2, 3}}, {{1024, std::nullopt, -99, -999}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{std::nullopt, -1}}, }); diff --git a/velox/functions/prestosql/fuzzer/ApproxDistinctResultVerifier.h 
b/velox/functions/prestosql/fuzzer/ApproxDistinctResultVerifier.h index 3d0e2eb43e6c..45e075ba36ed 100644 --- a/velox/functions/prestosql/fuzzer/ApproxDistinctResultVerifier.h +++ b/velox/functions/prestosql/fuzzer/ApproxDistinctResultVerifier.h @@ -228,7 +228,7 @@ class ApproxDistinctResultVerifier : public ResultVerifier { // standard error when the numGroups >= 50 and the error bound is smaller // than or equan to the default error bound. bool checkError = - (error_ <= common::hll::kDefaultApproxDistinctStandardError || + (error_ <= velox::common::hll::kDefaultApproxDistinctStandardError || numGroups >= 50); for (auto i = 0; i < numGroups; ++i) { const auto gap = diff --git a/velox/functions/prestosql/tests/ArrayCombinationsTest.cpp b/velox/functions/prestosql/tests/ArrayCombinationsTest.cpp index a3389d25510e..b3cd0f49e9d2 100644 --- a/velox/functions/prestosql/tests/ArrayCombinationsTest.cpp +++ b/velox/functions/prestosql/tests/ArrayCombinationsTest.cpp @@ -43,7 +43,7 @@ class ArrayCombinationsTest : public FunctionBaseTest { {{{{std::vector>()}}}, {{{{0, 1, 2}}, {{0, 1, 3}}, {{0, 2, 3}}, {{1, 2, 3}}}}, {{{{0, 1, 2, 3}}}}, - common::testutil::optionalEmpty}); + velox::common::testutil::optionalEmpty}); testExpr( expected, "combinations(C0, C1)", {arrayVector, comboLengthVector}); } @@ -69,7 +69,7 @@ class ArrayCombinationsTest : public FunctionBaseTest { {{ {{0, 1, std::nullopt, 3}}, }}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, }); testExpr( expected, "combinations(C0, C1)", {arrayVector, comboLengthVector}); @@ -176,7 +176,7 @@ TEST_F(ArrayCombinationsTest, inlineVarcharArrays) { {{"bb", "aa", "aa", "ddd"}}, {{"bb", "cc", "aa", "ddd"}}, {{"aa", "cc", "aa", "ddd"}}}}, - common::testutil::optionalEmpty}); + velox::common::testutil::optionalEmpty}); testExpr(expected, "combinations(C0, C1)", {arrayVector, comboLengthVector}); } @@ -223,7 +223,7 @@ TEST_F(ArrayCombinationsTest, varcharArrays) { "yellow rose flowers", "red shiny car ahead", "purple is an elegant color"}}}}, - common::testutil::optionalEmpty}); + velox::common::testutil::optionalEmpty}); testExpr(expected, "combinations(C0, C1)", {arrayVector, comboLengthVector}); } @@ -246,7 +246,7 @@ TEST_F(ArrayCombinationsTest, boolNullableArrays) { {{false, true, true, true}}, {{false, false, true, true}}, {{true, false, true, true}}}}, - common::testutil::optionalEmpty}); + velox::common::testutil::optionalEmpty}); testExpr(expected, "combinations(C0, C1)", {arrayVector, comboLengthVector}); } @@ -269,6 +269,6 @@ TEST_F(ArrayCombinationsTest, boolArrays) { {{false, true, true, true}}, {{false, false, true, true}}, {{true, false, true, true}}}}, - common::testutil::optionalEmpty}); + velox::common::testutil::optionalEmpty}); testExpr(expected, "combinations(C0, C1)", {arrayVector, comboLengthVector}); } diff --git a/velox/functions/prestosql/tests/ArrayIntersectTest.cpp b/velox/functions/prestosql/tests/ArrayIntersectTest.cpp index 971d86f2ae4a..af6cbbe2b182 100644 --- a/velox/functions/prestosql/tests/ArrayIntersectTest.cpp +++ b/velox/functions/prestosql/tests/ArrayIntersectTest.cpp @@ -66,7 +66,7 @@ class ArrayIntersectTest : public FunctionBaseTest { {{3, 8, std::nullopt}}, std::nullopt, {{1, 1, -2, -2, -2, 4, 8}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, }); auto array2 = makeNullableArrayVector({ {1, -2, 4}, @@ -79,10 +79,10 @@ class ArrayIntersectTest : public FunctionBaseTest { auto expected = makeNullableArrayVector({ {{1, -2, 4}}, {{1, -2}}, - 
common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, std::nullopt, {{1, -2, 4}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, }); testExpr(expected, "array_intersect(C0, C1)", {array1, array2}); testExpr(expected, "array_intersect(C1, C0)", {array1, array2}); @@ -97,12 +97,12 @@ class ArrayIntersectTest : public FunctionBaseTest { {{1, std::nullopt}}, }); expected = makeNullableArrayVector({ - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{2, -2}}, {std::vector>{std::nullopt}}, std::nullopt, {{1, 8}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, }); testExpr(expected, "array_intersect(C0, C1)", {array1, array2}); } @@ -202,7 +202,7 @@ class ArrayIntersectTest : public FunctionBaseTest { outerArrayType row1{{a1}, {a2}}; outerArrayType row2{{b1}, {b2}}; outerArrayType row3{{c1}, {c2}}; - outerArrayType row4{{a1}, common::testutil::optionalEmpty}; + outerArrayType row4{{a1}, velox::common::testutil::optionalEmpty}; auto arrayVector = makeNullableNestedArrayVector({{row1}, {row2}, {row3}, {row4}}); auto expected = makeNullableArrayVector( @@ -293,7 +293,7 @@ TEST_F(ArrayIntersectTest, boolNestedArrays) { outerArrayType row1{{a1}, {a2}}; outerArrayType row2{{b1}, {b2}, {b3}}; outerArrayType row3{{c1}, {c2}}; - outerArrayType row4{{a1}, common::testutil::optionalEmpty}; + outerArrayType row4{{a1}, velox::common::testutil::optionalEmpty}; auto arrayVector = makeNullableNestedArrayVector({{row1}, {row2}, {row3}, {row4}}); auto expected = makeNullableArrayVector( @@ -345,7 +345,7 @@ TEST_F(ArrayIntersectTest, strNestedArrays) { outerArrayType row1{{a1}, {a2}}; outerArrayType row2{{b1}, {b2}, {b3}}; outerArrayType row3{{c1}, {c2}}; - outerArrayType row4{{a1}, common::testutil::optionalEmpty}; + outerArrayType row4{{a1}, velox::common::testutil::optionalEmpty}; auto arrayVector = makeNullableNestedArrayVector( {{row1}, {row2}, {row3}, {row4}}); auto expected = makeNullableArrayVector( @@ -417,7 +417,7 @@ TEST_F(ArrayIntersectTest, longStrNestedArrays) { outerArrayType row1{{a1}, {a2}}; outerArrayType row2{{b1}, {b2}, {b3}}; outerArrayType row3{{c1}, {c2}}; - outerArrayType row4{{a1}, common::testutil::optionalEmpty}; + outerArrayType row4{{a1}, velox::common::testutil::optionalEmpty}; auto arrayVector = makeNullableNestedArrayVector( {{row1}, {row2}, {row3}, {row4}}); auto expected = makeNullableArrayVector( diff --git a/velox/functions/prestosql/tests/ComparisonsTest.cpp b/velox/functions/prestosql/tests/ComparisonsTest.cpp index b3655e2f9f31..1a80ec63b326 100644 --- a/velox/functions/prestosql/tests/ComparisonsTest.cpp +++ b/velox/functions/prestosql/tests/ComparisonsTest.cpp @@ -501,7 +501,7 @@ TEST_F(ComparisonsTest, eqNeqArray) { test(std::nullopt, {{1}}, std::nullopt); test({{1}}, std::nullopt, std::nullopt); - test(common::testutil::optionalEmpty, common::testutil::optionalEmpty, true); + test(velox::common::testutil::optionalEmpty, velox::common::testutil::optionalEmpty, true); test({{1, 2, 3}}, {{1, 2, 3}}, true); test({{1, 2, 3}}, {{1, 2, 4}}, false); @@ -513,7 +513,7 @@ TEST_F(ComparisonsTest, eqNeqArray) { test({{1, std::nullopt}}, {{1, 2}}, std::nullopt); // Different size arrays. 
- test(common::testutil::optionalEmpty, {{std::nullopt, std::nullopt}}, false); + test(velox::common::testutil::optionalEmpty, {{std::nullopt, std::nullopt}}, false); test({{1, 2}}, {{1, 2, std::nullopt}}, false); test( {{std::nullopt, std::nullopt}}, @@ -554,7 +554,7 @@ TEST_F(ComparisonsTest, eqNeqMap) { // Elements checked in sorted order. test({{{3, 4}, {1, 2}}}, {{{1, 2}, {3, 4}}}, true); - test(common::testutil::optionalEmpty, common::testutil::optionalEmpty, true); + test(velox::common::testutil::optionalEmpty, velox::common::testutil::optionalEmpty, true); test({{{1, 2}, {3, 5}}}, {{{1, 2}, {3, 4}}}, false); @@ -778,7 +778,7 @@ TEST_F(ComparisonsTest, eqNestedComplex) { // Compare Row(Array>, int, Map) using array_type = std::optional>>; array_type array1 = {{1, 2}}; - array_type array2 = common::testutil::optionalEmpty; + array_type array2 = velox::common::testutil::optionalEmpty; array_type array3 = {{1, 100, 2}}; auto vector1 = diff --git a/velox/functions/prestosql/tests/InPredicateTest.cpp b/velox/functions/prestosql/tests/InPredicateTest.cpp index 0ea2768c9dce..d8a0cb42118d 100644 --- a/velox/functions/prestosql/tests/InPredicateTest.cpp +++ b/velox/functions/prestosql/tests/InPredicateTest.cpp @@ -954,7 +954,7 @@ TEST_F(InPredicateTest, arrays) { auto data = makeRowVector({ makeNullableArrayVector({ {{1, 2, 3}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{1, 3}}, std::nullopt, {{2, 4, 5, 6}}, diff --git a/velox/functions/prestosql/tests/JsonCastTest.cpp b/velox/functions/prestosql/tests/JsonCastTest.cpp index 52060bf5c09a..38e42d7ba28e 100644 --- a/velox/functions/prestosql/tests/JsonCastTest.cpp +++ b/velox/functions/prestosql/tests/JsonCastTest.cpp @@ -1189,7 +1189,7 @@ TEST_F(JsonCastTest, toArray) { auto expected = makeNullableArrayVector( {{{"red"_sv, "blue"_sv}}, {{std::nullopt, std::nullopt, "purple"_sv}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, std::nullopt}); testCast(data, expected); @@ -1218,7 +1218,7 @@ TEST_F(JsonCastTest, toMap) { auto expected = makeNullableMapVector( {{{{"blue"_sv, "2.2"_sv}, {"red"_sv, "1"_sv}}}, {{{"purple"_sv, std::nullopt}, {"yellow"_sv, "4"_sv}}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, std::nullopt}); testCast(data, expected); @@ -1233,7 +1233,7 @@ TEST_F(JsonCastTest, toMap) { expected = makeNullableMapVector( {{{{101, 1.1}, {102, 2.0}}}, {{{103, std::nullopt}, {104, 4.0}}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, std::nullopt}); testCast(data, expected); @@ -1418,7 +1418,7 @@ TEST_F(JsonCastTest, toNested) { {{{{{"1"_sv, "2"_sv}}, {{"3"_sv}}}}, {{{{std::nullopt, std::nullopt, "4"_sv}}}}, {{common::testutil::optionalEmpty}}, - common::testutil::optionalEmpty}); + velox::common::testutil::optionalEmpty}); testCast(array, arrayExpected); diff --git a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp index bd018163143e..265e613a71ed 100644 --- a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp @@ -1163,7 +1163,7 @@ TEST_F(JsonFunctionsTest, jsonStringToArrayCast) { auto expected = makeNullableArrayVector( {{{"red"_sv, "blue"_sv}}, {{std::nullopt, std::nullopt, "purple"_sv}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, std::nullopt}); checkInternalFn( diff --git a/velox/functions/prestosql/tests/MapFilterTest.cpp 
b/velox/functions/prestosql/tests/MapFilterTest.cpp index 61ca65b9f495..2d1e48f7095e 100644 --- a/velox/functions/prestosql/tests/MapFilterTest.cpp +++ b/velox/functions/prestosql/tests/MapFilterTest.cpp @@ -270,7 +270,7 @@ TEST_F(MapFilterTest, try) { {{{{1, 2}, {2, 3}}}, std::nullopt, {{{7, 8}}}, - common::testutil::optionalEmpty}); + velox::common::testutil::optionalEmpty}); assertEqualVectors(expected, result); } diff --git a/velox/functions/prestosql/tests/ZipTest.cpp b/velox/functions/prestosql/tests/ZipTest.cpp index 92e9166a9980..92ac4870dc8e 100644 --- a/velox/functions/prestosql/tests/ZipTest.cpp +++ b/velox/functions/prestosql/tests/ZipTest.cpp @@ -92,7 +92,7 @@ TEST_F(ZipTest, combineInt) { TEST_F(ZipTest, nullEmptyArray) { auto firstVector = makeNullableArrayVector({ {{1, 1, 1, 1}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, std::nullopt, }); diff --git a/velox/functions/sparksql/tests/ArraySortTestData.h b/velox/functions/sparksql/tests/ArraySortTestData.h index 38de4684e346..972f761d68f5 100644 --- a/velox/functions/sparksql/tests/ArraySortTestData.h +++ b/velox/functions/sparksql/tests/ArraySortTestData.h @@ -240,7 +240,7 @@ arrayInput() { using A = std::vector>; return std::vector>>>{ // Empty. - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, // All nulls. {{std::nullopt, std::nullopt}}, // Same prefix. @@ -257,7 +257,7 @@ inline std::vector>; return std::vector>>>{ - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{std::nullopt, std::nullopt}}, {{A({1, 3}), A({1, 3, 5}), A({2, 1})}}, {{std::nullopt, A({1, 3}), A({2, 1})}}, @@ -270,7 +270,7 @@ inline std::vector>; return std::vector>>>{ - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{std::nullopt, std::nullopt}}, {{A({1, 3}), A({1, 3, 5}), A({2, 1})}}, {{A({1, 3}), A({2, 1}), std::nullopt}}, diff --git a/velox/functions/sparksql/tests/HashTest.cpp b/velox/functions/sparksql/tests/HashTest.cpp index ff506f2ddd69..2a0305c041d4 100644 --- a/velox/functions/sparksql/tests/HashTest.cpp +++ b/velox/functions/sparksql/tests/HashTest.cpp @@ -234,7 +234,7 @@ TEST_F(HashTest, map) { auto mapWithNullArrays = createMapOfArraysVector( {{{1, std::nullopt}}, {{2, {{4, 5, std::nullopt}}}}, - {{3, common::testutil::optionalEmpty}}}); + {{3, velox::common::testutil::optionalEmpty}}}); assertEqualVectors( makeFlatVector({-1712319331, 2060637564, 519220707}), hash(mapWithNullArrays)); diff --git a/velox/functions/sparksql/tests/XxHash64Test.cpp b/velox/functions/sparksql/tests/XxHash64Test.cpp index c375a7a1d302..05a864e5f2f0 100644 --- a/velox/functions/sparksql/tests/XxHash64Test.cpp +++ b/velox/functions/sparksql/tests/XxHash64Test.cpp @@ -247,7 +247,7 @@ TEST_F(XxHash64Test, map) { auto mapWithNullArrays = createMapOfArraysVector( {{{1, std::nullopt}}, {{2, {{4, 5, std::nullopt}}}}, - {{3, common::testutil::optionalEmpty}}}); + {{3, velox::common::testutil::optionalEmpty}}}); assertEqualVectors( makeFlatVector( {-7001672635703045582, 7217681953522744649, 3188756510806108107}), diff --git a/velox/python/init/PyInit.cpp b/velox/python/init/PyInit.cpp index 8734fb6dd78d..655ff4053b37 100644 --- a/velox/python/init/PyInit.cpp +++ b/velox/python/init/PyInit.cpp @@ -46,7 +46,7 @@ void registerAllResourcesOnce() { velox::core::PlanNode::registerSerDe(); velox::Type::registerSerDe(); velox::common::Filter::registerSerDe(); - velox::connector::hive::LocationHandle::registerSerDe(); + 
velox::connector::hive::HiveLocationHandle::registerSerDe(); velox::connector::hive::HiveSortingColumn::registerSerDe(); velox::connector::hive::HiveBucketProperty::registerSerDe(); velox::connector::hive::HiveTableHandle::registerSerDe(); diff --git a/velox/python/plan_builder/PyPlanBuilder.cpp b/velox/python/plan_builder/PyPlanBuilder.cpp index ab581fc0badd..09e2c0598b4d 100644 --- a/velox/python/plan_builder/PyPlanBuilder.cpp +++ b/velox/python/plan_builder/PyPlanBuilder.cpp @@ -140,13 +140,13 @@ PyPlanBuilder& PyPlanBuilder::tableScan( if (!subfields.empty() || !rowIndexColumnName.empty()) { std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; for (size_t i = 0; i < outputRowSchema->size(); ++i) { auto name = outputRowSchema->nameOf(i); auto type = outputRowSchema->childAt(i); - std::vector requiredSubfields; + std::vector requiredSubfields; py::object key = py::cast(name); @@ -197,7 +197,7 @@ PyPlanBuilder& PyPlanBuilder::tableScan( .endTableScan(); // Store the id of the scan and the respective splits. - std::vector> splits; + std::vector> splits; if (inputFiles.has_value()) { for (const auto& inputFile : *inputFiles) { splits.push_back(std::make_shared( @@ -340,7 +340,7 @@ PyPlanBuilder& PyPlanBuilder::tpchGen( connectorId); // Generate one split per part. - std::vector> splits; + std::vector> splits; for (size_t i = 0; i < numParts; ++i) { splits.push_back(std::make_shared( connectorId, numParts, i)); diff --git a/velox/python/plan_builder/PyPlanBuilder.h b/velox/python/plan_builder/PyPlanBuilder.h index 219f79fca4a3..9209ee3f9212 100644 --- a/velox/python/plan_builder/PyPlanBuilder.h +++ b/velox/python/plan_builder/PyPlanBuilder.h @@ -29,7 +29,7 @@ class PyVector; // node id to a list of splits. using TScanFiles = std::unordered_map< core::PlanNodeId, - std::vector>>; + std::vector>>; using TQueryConfigs = std::unordered_map; /// Stores the context for a particular plan generation, since a single plan may diff --git a/velox/python/runner/PyConnectors.cpp b/velox/python/runner/PyConnectors.cpp index ad5e5b416eed..82b47ca72a14 100644 --- a/velox/python/runner/PyConnectors.cpp +++ b/velox/python/runner/PyConnectors.cpp @@ -30,16 +30,16 @@ template void registerConnector( const std::string& connectorId, std::unordered_map configs) { - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared(connectorId.data())); const auto configBase = std::make_shared(std::move(configs)); auto connector = - connector::getConnectorFactory(connectorId) + connector::common::getConnectorFactory(connectorId) ->newConnector( connectorId, configBase, folly::getGlobalCPUExecutor().get()); - connector::registerConnector(connector); + connector::common::registerConnector(connector); connectorRegistry().insert(connectorId); } @@ -61,8 +61,8 @@ void registerTpch( // Is it ok to unregister connectors that were not registered. 
void unregister(const std::string& connectorId) { - if (!facebook::velox::connector::unregisterConnector(connectorId) || - !facebook::velox::connector::unregisterConnectorFactory(connectorId)) { + if (!facebook::velox::connector::common::unregisterConnector(connectorId) || + !facebook::velox::connector::common::unregisterConnectorFactory(connectorId)) { throw std::runtime_error( fmt::format("Unable to unregister connector '{}'", connectorId)); } diff --git a/velox/row/tests/CompactRowTest.cpp b/velox/row/tests/CompactRowTest.cpp index bc77dab70728..b39b12064e01 100644 --- a/velox/row/tests/CompactRowTest.cpp +++ b/velox/row/tests/CompactRowTest.cpp @@ -191,7 +191,7 @@ TEST_F(CompactRowTest, rowSizeArrayOfBigint) { makeNullableArrayVector({ {{1, 2, std::nullopt, 3}}, {{4, 5}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, std::nullopt, {{6}}, }), @@ -250,7 +250,7 @@ TEST_F(CompactRowTest, rowSizeArrayOfStrings) { data = makeRowVector({ makeNullableArrayVector({ {{"a", "Abc", std::nullopt}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, std::nullopt, {{"a", std::nullopt, "Longer string", "abc"}}, }), @@ -405,7 +405,7 @@ TEST_F(CompactRowTest, arrayOfBigint) { {{std::nullopt, 6}}, {{std::nullopt}}, std::nullopt, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, }), }); @@ -431,7 +431,7 @@ TEST_F(CompactRowTest, arrayOfTimestamp) { {{std::nullopt, ts(6)}}, {{std::nullopt}}, std::nullopt, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, }), }); @@ -458,7 +458,7 @@ TEST_F(CompactRowTest, arrayOfString) { "Abc 12345 ...test", std::nullopt, "foo"}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{std::nullopt}}, std::nullopt, }), diff --git a/velox/runner/LocalRunner.h b/velox/runner/LocalRunner.h index 22efbe5de6dd..c11f97071992 100644 --- a/velox/runner/LocalRunner.h +++ b/velox/runner/LocalRunner.h @@ -15,7 +15,7 @@ */ #pragma once -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" #include "velox/exec/Cursor.h" #include "velox/exec/Exchange.h" #include "velox/runner/MultiFragmentPlan.h" @@ -28,13 +28,13 @@ namespace facebook::velox::runner { class SimpleSplitSource : public SplitSource { public: explicit SimpleSplitSource( - std::vector> splits) + std::vector> splits) : splits_(std::move(splits)) {} virtual std::vector getSplits(uint64_t targetBytes) override; private: - std::vector> splits_; + std::vector> splits_; int32_t splitIdx_{0}; }; @@ -45,7 +45,7 @@ class SimpleSplitSourceFactory : public SplitSourceFactory { explicit SimpleSplitSourceFactory( std::unordered_map< core::PlanNodeId, - std::vector>> nodeSplitMap) + std::vector>> nodeSplitMap) : nodeSplitMap_(std::move(nodeSplitMap)) {} std::shared_ptr splitSourceForScan( @@ -54,7 +54,7 @@ class SimpleSplitSourceFactory : public SplitSourceFactory { private: std::unordered_map< core::PlanNodeId, - std::vector>> + std::vector>> nodeSplitMap_; }; diff --git a/velox/runner/Runner.h b/velox/runner/Runner.h index 21a1a00ce068..ff0b6dc52162 100644 --- a/velox/runner/Runner.h +++ b/velox/runner/Runner.h @@ -15,7 +15,7 @@ */ #pragma once -#include "velox/connectors/Connector.h" +#include "velox/connectors/common/Connector.h" #include "velox/exec/Cursor.h" #include "velox/exec/Exchange.h" #include "velox/runner/MultiFragmentPlan.h" @@ -34,7 +34,7 @@ class SplitSource { /// group means that there are on more splits for the group. 
In ungrouped /// execution, the group is kUngroupedGroupId. struct SplitAndGroup { - std::shared_ptr split; + std::shared_ptr split; uint32_t group{kUngroupedGroupId}; }; diff --git a/velox/serializers/PrestoBatchVectorSerializer.cpp b/velox/serializers/PrestoBatchVectorSerializer.cpp index cb1c1ad647d7..f0ab1780d5ff 100644 --- a/velox/serializers/PrestoBatchVectorSerializer.cpp +++ b/velox/serializers/PrestoBatchVectorSerializer.cpp @@ -48,7 +48,7 @@ void PrestoBatchVectorSerializer::serialize( !inUse.exchange(true), "PrestoBatchVectorSerializer::serialize being called concurrently on the same object."); - common::testutil::TestValue::adjust( + velox::common::testutil::TestValue::adjust( "facebook::velox::serializers::PrestoBatchVectorSerializer::serialize", this); diff --git a/velox/serializers/PrestoBatchVectorSerializer.h b/velox/serializers/PrestoBatchVectorSerializer.h index c7828cc9d97a..ddbf64047f2a 100644 --- a/velox/serializers/PrestoBatchVectorSerializer.h +++ b/velox/serializers/PrestoBatchVectorSerializer.h @@ -26,7 +26,7 @@ class PrestoBatchVectorSerializer : public BatchVectorSerializer { memory::MemoryPool* pool, const PrestoVectorSerde::PrestoOptions& opts) : pool_(pool), - codec_(common::compressionKindToCodec(opts.compressionKind)), + codec_(velox::common::compressionKindToCodec(opts.compressionKind)), opts_(opts) {} void serialize( diff --git a/velox/serializers/PrestoIterativeVectorSerializer.cpp b/velox/serializers/PrestoIterativeVectorSerializer.cpp index de73cd7f1ccf..383f2dbf2258 100644 --- a/velox/serializers/PrestoIterativeVectorSerializer.cpp +++ b/velox/serializers/PrestoIterativeVectorSerializer.cpp @@ -25,7 +25,7 @@ PrestoIterativeVectorSerializer::PrestoIterativeVectorSerializer( const PrestoVectorSerde::PrestoOptions& opts) : opts_(opts), streamArena_(streamArena), - codec_(common::compressionKindToCodec(opts.compressionKind)), + codec_(velox::common::compressionKindToCodec(opts.compressionKind)), streams_(memory::StlAllocator(*streamArena->pool())) { const auto types = rowType->children(); const auto numTypes = types.size(); @@ -92,8 +92,8 @@ void PrestoIterativeVectorSerializer::flush(OutputStream* out) { out); } else { if (numCompressionToSkip_ > 0) { - const auto noCompressionCodec = common::compressionKindToCodec( - common::CompressionKind::CompressionKind_NONE); + const auto noCompressionCodec = velox::common::compressionKindToCodec( + velox::common::CompressionKind::CompressionKind_NONE); auto [size, ignore] = flushStreams( streams_, numRows_, *streamArena_, *noCompressionCodec, 1, out); stats_.compressionSkippedBytes += size; diff --git a/velox/serializers/PrestoSerializer.cpp b/velox/serializers/PrestoSerializer.cpp index 2217c2eed226..12e04415ae8e 100644 --- a/velox/serializers/PrestoSerializer.cpp +++ b/velox/serializers/PrestoSerializer.cpp @@ -138,7 +138,7 @@ void PrestoVectorSerde::deserialize( const Options* options) { const auto prestoOptions = toPrestoOptions(options); const auto codec = - common::compressionKindToCodec(prestoOptions.compressionKind); + velox::common::compressionKindToCodec(prestoOptions.compressionKind); auto maybeHeader = detail::PrestoHeader::read(source); VELOX_CHECK( maybeHeader.hasValue(), @@ -205,7 +205,7 @@ void PrestoVectorSerde::deserializeSingleColumn( const auto prestoOptions = toPrestoOptions(options); VELOX_CHECK_EQ( prestoOptions.compressionKind, - common::CompressionKind::CompressionKind_NONE); + velox::common::CompressionKind::CompressionKind_NONE); if (*result && result->use_count() == 1) { VELOX_CHECK( 
*(*result)->type() == *type, @@ -232,7 +232,7 @@ void PrestoVectorSerde::serializeSingleColumn( const auto prestoOptions = toPrestoOptions(opts); VELOX_USER_CHECK_EQ( prestoOptions.compressionKind, - common::CompressionKind::CompressionKind_NONE); + velox::common::CompressionKind::CompressionKind_NONE); VELOX_USER_CHECK_EQ(prestoOptions.nullsFirst, false); const IndexRange range{0, vector->size()}; @@ -277,7 +277,7 @@ void PrestoVectorSerde::registerNamedVectorSerde() { "Lossless timestamps are not supported, because they cannot be decoded without the Schema")); VELOX_RETURN_IF( prestoOptions.compressionKind != - common::CompressionKind::CompressionKind_NONE, + velox::common::CompressionKind::CompressionKind_NONE, Status::Invalid("Compression is not supported")); VELOX_RETURN_IF( prestoOptions.nullsFirst, diff --git a/velox/serializers/PrestoSerializer.h b/velox/serializers/PrestoSerializer.h index a079e232b8a5..ff9139d1a960 100644 --- a/velox/serializers/PrestoSerializer.h +++ b/velox/serializers/PrestoSerializer.h @@ -54,7 +54,7 @@ class PrestoVectorSerde : public VectorSerde { PrestoOptions( bool _useLosslessTimestamp, - common::CompressionKind _compressionKind, + velox::common::CompressionKind _compressionKind, float _minCompressionRatio = 0.8, bool _nullsFirst = false, bool _preserveEncodings = false) diff --git a/velox/serializers/PrestoSerializerDeserializationUtils.cpp b/velox/serializers/PrestoSerializerDeserializationUtils.cpp index 30c0ae17a8ac..05c3458c2435 100644 --- a/velox/serializers/PrestoSerializerDeserializationUtils.cpp +++ b/velox/serializers/PrestoSerializerDeserializationUtils.cpp @@ -1357,7 +1357,7 @@ void readTopColumns( const auto& childTypes = type->asRow().children(); // Bug for bug compatibility: Extra columns at the end are allowed for // non-compressed data. - if (opts.compressionKind == common::CompressionKind_NONE) { + if (opts.compressionKind == velox::common::CompressionKind_NONE) { VELOX_USER_CHECK_GE( numColumns, type->size(), diff --git a/velox/serializers/RowSerializer.h b/velox/serializers/RowSerializer.h index 4ab11847cfb8..ee36185dc10f 100644 --- a/velox/serializers/RowSerializer.h +++ b/velox/serializers/RowSerializer.h @@ -47,7 +47,7 @@ class RowSerializer : public IterativeVectorSerializer { RowSerializer(memory::MemoryPool* pool, const VectorSerde::Options* options) : pool_(pool), options_(options == nullptr ? VectorSerde::Options() : *options), - codec_(common::compressionKindToCodec(options_.compressionKind)) {} + codec_(velox::common::compressionKindToCodec(options_.compressionKind)) {} void append( const RowVectorPtr& vector, @@ -394,13 +394,13 @@ class RowDeserializer { ? VectorSerde::Options().compressionKind : options->compressionKind; VELOX_DCHECK_NE( - compressionKind, common::CompressionKind::CompressionKind_NONE); + compressionKind, velox::common::CompressionKind::CompressionKind_NONE); auto compressBuf = folly::IOBuf::create(header.compressedSize); source->readBytes(compressBuf->writableData(), header.compressedSize); compressBuf->append(header.compressedSize); // Process chained uncompressed results IOBufs. 
- const auto codec = common::compressionKindToCodec(compressionKind); + const auto codec = velox::common::compressionKindToCodec(compressionKind); auto uncompressedBuf = codec->uncompress(compressBuf.get(), header.uncompressedSize); diff --git a/velox/serializers/tests/CompactRowSerializerTest.cpp b/velox/serializers/tests/CompactRowSerializerTest.cpp index 5bcfae91c3d4..38280f822038 100644 --- a/velox/serializers/tests/CompactRowSerializerTest.cpp +++ b/velox/serializers/tests/CompactRowSerializerTest.cpp @@ -25,12 +25,12 @@ namespace facebook::velox::serializer { namespace { struct TestParam { - common::CompressionKind compressionKind; + velox::common::CompressionKind compressionKind; bool appendRow; bool microBatchDeserialize; TestParam( - common::CompressionKind _compressionKind, + velox::common::CompressionKind _compressionKind, bool _appendRow, bool _microBatchDeserialize) : compressionKind(_compressionKind), @@ -272,10 +272,10 @@ class CompactRowSerializerTest : public ::testing::Test, private: bool needCompression() { - return compressionKind_ != common::CompressionKind::CompressionKind_NONE; + return compressionKind_ != velox::common::CompressionKind::CompressionKind_NONE; } - common::CompressionKind compressionKind_; + velox::common::CompressionKind compressionKind_; std::unique_ptr options_; bool appendRow_; bool microBatchDeserialize_; diff --git a/velox/serializers/tests/PrestoSerializerTest.cpp b/velox/serializers/tests/PrestoSerializerTest.cpp index c4f41f7d69f9..0845f2af2c64 100644 --- a/velox/serializers/tests/PrestoSerializerTest.cpp +++ b/velox/serializers/tests/PrestoSerializerTest.cpp @@ -74,7 +74,7 @@ class Foo { std::unordered_map> Foo::instances_; class PrestoSerializerTest - : public ::testing::TestWithParam, + : public ::testing::TestWithParam, public VectorTestBase { protected: static void SetUpTestSuite() { @@ -118,7 +118,7 @@ class PrestoSerializerTest serdeOptions) { const bool useLosslessTimestamp = serdeOptions == nullptr ? false : serdeOptions->useLosslessTimestamp; - common::CompressionKind kind = GetParam(); + velox::common::CompressionKind kind = GetParam(); const bool nullsFirst = serdeOptions == nullptr ? false : serdeOptions->nullsFirst; const bool preserveEncodings = @@ -189,7 +189,7 @@ class PrestoSerializerTest facebook::velox::serializer::presto::PrestoOutputStreamListener listener; OStreamOutputStream out(output, &listener); serializer->flush(&out); - if (paramOptions.compressionKind == common::CompressionKind_NONE) { + if (paramOptions.compressionKind == velox::common::CompressionKind_NONE) { EXPECT_EQ(size, out.tellp() - streamInitialSize); } else { EXPECT_GE(size, out.tellp() - streamInitialSize); @@ -216,7 +216,7 @@ class PrestoSerializerTest paramOptions) { if (paramOptions.useLosslessTimestamp || paramOptions.compressionKind != - common::CompressionKind::CompressionKind_NONE || + velox::common::CompressionKind::CompressionKind_NONE || paramOptions.nullsFirst) { // Unsupported options return; @@ -1132,7 +1132,7 @@ TEST_P(PrestoSerializerTest, timestampWithNanosecondPrecision) { // passed to the serde. 
const serializer::presto::PrestoVectorSerde::PrestoOptions kUseLosslessTimestampOptions( - true, common::CompressionKind::CompressionKind_NONE); + true, velox::common::CompressionKind::CompressionKind_NONE); auto timestamp = makeFlatVector( {Timestamp{0, 0}, Timestamp{12, 0}, @@ -1538,7 +1538,7 @@ TEST_P(PrestoSerializerTest, opaqueInteractiveVectorSerializer) { TEST_P(PrestoSerializerTest, encodedConcatenation) { // Slow test, run only for no compression. - if (GetParam() != common::CompressionKind::CompressionKind_NONE) { + if (GetParam() != velox::common::CompressionKind::CompressionKind_NONE) { return; } @@ -1589,7 +1589,7 @@ TEST_P(PrestoSerializerTest, encodedConcatenation) { TEST_P(PrestoSerializerTest, encodedConcatenation2) { // Slow test, run only for no compression. - if (GetParam() != common::CompressionKind::CompressionKind_NONE) { + if (GetParam() != velox::common::CompressionKind::CompressionKind_NONE) { return; } VectorFuzzer::Options options; @@ -1639,7 +1639,7 @@ TEST_P(PrestoSerializerTest, typeMismatch) { "number of columns requested for deserialization"); // TMore columns in serialization than in type. - if (GetParam() == common::CompressionKind_NONE) { + if (GetParam() == velox::common::CompressionKind_NONE) { // No throw. deserialize(ROW({BIGINT()}), serialized, nullptr); } else { @@ -1723,12 +1723,12 @@ INSTANTIATE_TEST_SUITE_P( PrestoSerializerTest, PrestoSerializerTest, ::testing::Values( - common::CompressionKind::CompressionKind_NONE, - common::CompressionKind::CompressionKind_ZLIB, - common::CompressionKind::CompressionKind_SNAPPY, - common::CompressionKind::CompressionKind_ZSTD, - common::CompressionKind::CompressionKind_LZ4, - common::CompressionKind::CompressionKind_GZIP)); + velox::common::CompressionKind::CompressionKind_NONE, + velox::common::CompressionKind::CompressionKind_ZLIB, + velox::common::CompressionKind::CompressionKind_SNAPPY, + velox::common::CompressionKind::CompressionKind_ZSTD, + velox::common::CompressionKind::CompressionKind_LZ4, + velox::common::CompressionKind::CompressionKind_GZIP)); TEST_F(PrestoSerializerTest, serdeSingleColumn) { // The difference between serialized data obtained from diff --git a/velox/serializers/tests/UnsafeRowSerializerTest.cpp b/velox/serializers/tests/UnsafeRowSerializerTest.cpp index 36642caf993e..7e0a62f8dd26 100644 --- a/velox/serializers/tests/UnsafeRowSerializerTest.cpp +++ b/velox/serializers/tests/UnsafeRowSerializerTest.cpp @@ -24,10 +24,10 @@ using namespace facebook; using namespace facebook::velox; struct TestParam { - common::CompressionKind compressionKind; + velox::common::CompressionKind compressionKind; bool appendRow; - TestParam(common::CompressionKind _compressionKind, bool _appendRow) + TestParam(velox::common::CompressionKind _compressionKind, bool _appendRow) : compressionKind(_compressionKind), appendRow(_appendRow) {} }; @@ -196,14 +196,14 @@ class UnsafeRowSerializerTest : public ::testing::Test, } bool needCompression() { - return compressionKind_ != common::CompressionKind::CompressionKind_NONE; + return compressionKind_ != velox::common::CompressionKind::CompressionKind_NONE; } std::shared_ptr pool_; private: static constexpr int32_t kHeaderSize = sizeof(int32_t) * 2 + sizeof(char); - common::CompressionKind compressionKind_; + velox::common::CompressionKind compressionKind_; std::unique_ptr options_; bool appendRow_; }; diff --git a/velox/substrait/SubstraitToVeloxPlan.cpp b/velox/substrait/SubstraitToVeloxPlan.cpp index 165da856597f..0ab4438faaf1 100644 --- 
a/velox/substrait/SubstraitToVeloxPlan.cpp +++ b/velox/substrait/SubstraitToVeloxPlan.cpp @@ -412,11 +412,11 @@ core::PlanNodePtr SubstraitVeloxPlanConverter::toVeloxPlan( kHiveConnectorId, "hive_table", filterPushdownEnabled, - common::SubfieldFilters{}, + velox::common::SubfieldFilters{}, nullptr, nullptr); } else { - common::SubfieldFilters filters = + velox::common::SubfieldFilters filters = toVeloxFilter(colNameList, veloxTypeList, readRel.filter()); tableHandle = std::make_shared( kHiveConnectorId, @@ -432,7 +432,7 @@ core::PlanNodePtr SubstraitVeloxPlanConverter::toVeloxPlan( outNames.reserve(colNameList.size()); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; for (int idx = 0; idx < colNameList.size(); idx++) { auto outName = substraitParser_->makeNodeName(planNodeId_, idx); @@ -639,7 +639,7 @@ common::SubfieldFilters SubstraitVeloxPlanConverter::toVeloxFilter( const std::vector& inputNameList, const std::vector& inputTypeList, const ::substrait::Expression& substraitFilter) { - common::SubfieldFilters filters; + velox::common::SubfieldFilters filters; // A map betweesn the column index and the FilterInfo for that column. std::unordered_map> colInfoMap; for (int idx = 0; idx < inputNameList.size(); idx++) { @@ -716,7 +716,7 @@ common::SubfieldFilters SubstraitVeloxPlanConverter::toVeloxFilter( rightExclusive = filterInfo->rightExclusive_; } bool nullAllowed = filterInfo->nullAllowed_; - filters[common::Subfield(inputNameList[idx])] = + filters[velox::common::Subfield(inputNameList[idx])] = std::make_unique( leftBound, leftUnbounded, diff --git a/velox/substrait/SubstraitToVeloxPlan.h b/velox/substrait/SubstraitToVeloxPlan.h index efdd16b012a9..40ea984bfff5 100644 --- a/velox/substrait/SubstraitToVeloxPlan.h +++ b/velox/substrait/SubstraitToVeloxPlan.h @@ -125,7 +125,7 @@ class SubstraitVeloxPlanConverter { /// Used to convert Substrait Filter into Velox SubfieldFilters which will /// be used in TableScan. 
- common::SubfieldFilters toVeloxFilter( + velox::common::SubfieldFilters toVeloxFilter( const std::vector& inputNameList, const std::vector& inputTypeList, const ::substrait::Expression& substraitFilter); diff --git a/velox/substrait/tests/Substrait2VeloxPlanConversionTest.cpp b/velox/substrait/tests/Substrait2VeloxPlanConversionTest.cpp index bc330463d465..b80050c2d089 100644 --- a/velox/substrait/tests/Substrait2VeloxPlanConversionTest.cpp +++ b/velox/substrait/tests/Substrait2VeloxPlanConversionTest.cpp @@ -37,7 +37,7 @@ class Substrait2VeloxPlanConversionTest memory::MemoryManager::testingSetInstance(memory::MemoryManager::Options{}); } - std::vector> + std::vector> makeSplits( const facebook::velox::substrait::SubstraitVeloxPlanConverter& converter, std::shared_ptr planNode) { @@ -52,7 +52,7 @@ class Substrait2VeloxPlanConversionTest const auto& lengths = splitInfo->lengths; const auto fileFormat = splitInfo->format; - std::vector> + std::vector> splits; splits.reserve(paths.size()); diff --git a/velox/tool/trace/TableWriterReplayer.cpp b/velox/tool/trace/TableWriterReplayer.cpp index 8644765b4e51..fe5e2be5aa8a 100644 --- a/velox/tool/trace/TableWriterReplayer.cpp +++ b/velox/tool/trace/TableWriterReplayer.cpp @@ -37,16 +37,16 @@ makeHiveInsertTableHandle( node->insertTableHandle()->connectorInsertTableHandle()); const auto inputColumns = tracedHandle->inputColumns(); const auto compressionKind = - tracedHandle->compressionKind().value_or(common::CompressionKind_NONE); + tracedHandle->compressionKind().value_or(velox::common::CompressionKind_NONE); const auto storageFormat = tracedHandle->storageFormat(); const auto serdeParameters = tracedHandle->serdeParameters(); const auto writerOptions = tracedHandle->writerOptions(); return std::make_shared( inputColumns, - std::make_shared( + std::make_shared( targetDir, targetDir, - connector::hive::LocationHandle::TableType::kNew), + connector::common::LocationHandle::TableType::kNew), storageFormat, tracedHandle->bucketProperty() == nullptr ? 
nullptr diff --git a/velox/tool/trace/TraceReplayRunner.cpp b/velox/tool/trace/TraceReplayRunner.cpp index bd74855751e7..18071610aef1 100644 --- a/velox/tool/trace/TraceReplayRunner.cpp +++ b/velox/tool/trace/TraceReplayRunner.cpp @@ -268,7 +268,7 @@ void TraceReplayRunner::init() { core::PlanNode::registerSerDe(); core::ITypedExpr::registerSerDe(); - common::Filter::registerSerDe(); + velox::common::Filter::registerSerDe(); Type::registerSerDe(); exec::registerPartitionFunctionSerDe(); if (!isRegisteredVectorSerde()) { @@ -284,7 +284,7 @@ void TraceReplayRunner::init() { serializer::spark::UnsafeRowVectorSerde::registerNamedVectorSerde(); } connector::hive::HiveTableHandle::registerSerDe(); - connector::hive::LocationHandle::registerSerDe(); + connector::hive::HiveLocationHandle::registerSerDe(); connector::hive::HiveColumnHandle::registerSerDe(); connector::hive::HiveInsertTableHandle::registerSerDe(); connector::hive::HiveInsertFileNameGenerator::registerSerDe(); @@ -297,7 +297,7 @@ void TraceReplayRunner::init() { parse::registerTypeResolver(); if (!facebook::velox::connector::hasConnectorFactory("hive")) { - connector::registerConnectorFactory( + connector::common::registerConnectorFactory( std::make_shared()); } @@ -354,15 +354,15 @@ TraceReplayRunner::createReplayer() const { } else if (traceNodeName == "TableScan") { const auto connectorId = taskTraceMetadataReader_->connectorId(FLAGS_node_id); - if (const auto& collectors = connector::getAllConnectors(); + if (const auto& collectors = connector::common::getAllConnectors(); collectors.find(connectorId) == collectors.end()) { const auto hiveConnector = - connector::getConnectorFactory("hive")->newConnector( + connector::common::getConnectorFactory("hive")->newConnector( connectorId, std::make_shared( std::unordered_map()), ioExecutor_.get()); - connector::registerConnector(hiveConnector); + connector::common::registerConnector(hiveConnector); } replayer = std::make_unique( FLAGS_root_dir, diff --git a/velox/tool/trace/tests/AggregationReplayerTest.cpp b/velox/tool/trace/tests/AggregationReplayerTest.cpp index caaa7848ce38..fd9d2b0cdff7 100644 --- a/velox/tool/trace/tests/AggregationReplayerTest.cpp +++ b/velox/tool/trace/tests/AggregationReplayerTest.cpp @@ -65,9 +65,9 @@ class AggregationReplayerTest : public HiveConnectorTestBase { serializer::presto::PrestoVectorSerde::registerVectorSerde(); } Type::registerSerDe(); - common::Filter::registerSerDe(); + velox::common::Filter::registerSerDe(); connector::hive::HiveTableHandle::registerSerDe(); - connector::hive::LocationHandle::registerSerDe(); + connector::hive::HiveLocationHandle::registerSerDe(); connector::hive::HiveColumnHandle::registerSerDe(); connector::hive::HiveInsertTableHandle::registerSerDe(); connector::hive::HiveInsertFileNameGenerator::registerSerDe(); diff --git a/velox/tool/trace/tests/FilterProjectReplayerTest.cpp b/velox/tool/trace/tests/FilterProjectReplayerTest.cpp index d314c538f9a0..ec450269be1d 100644 --- a/velox/tool/trace/tests/FilterProjectReplayerTest.cpp +++ b/velox/tool/trace/tests/FilterProjectReplayerTest.cpp @@ -64,9 +64,9 @@ class FilterProjectReplayerTest : public HiveConnectorTestBase { serializer::presto::PrestoVectorSerde::registerVectorSerde(); } Type::registerSerDe(); - common::Filter::registerSerDe(); + velox::common::Filter::registerSerDe(); connector::hive::HiveTableHandle::registerSerDe(); - connector::hive::LocationHandle::registerSerDe(); + connector::hive::HiveLocationHandle::registerSerDe(); 
connector::hive::HiveColumnHandle::registerSerDe(); connector::hive::HiveInsertTableHandle::registerSerDe(); connector::hive::HiveInsertFileNameGenerator::registerSerDe(); diff --git a/velox/tool/trace/tests/HashJoinReplayerTest.cpp b/velox/tool/trace/tests/HashJoinReplayerTest.cpp index f3328963b612..76dd3118e95e 100644 --- a/velox/tool/trace/tests/HashJoinReplayerTest.cpp +++ b/velox/tool/trace/tests/HashJoinReplayerTest.cpp @@ -65,9 +65,9 @@ class HashJoinReplayerTest : public HiveConnectorTestBase { serializer::presto::PrestoVectorSerde::registerVectorSerde(); } Type::registerSerDe(); - common::Filter::registerSerDe(); + velox::common::Filter::registerSerDe(); connector::hive::HiveTableHandle::registerSerDe(); - connector::hive::LocationHandle::registerSerDe(); + connector::hive::HiveLocationHandle::registerSerDe(); connector::hive::HiveColumnHandle::registerSerDe(); connector::hive::HiveInsertTableHandle::registerSerDe(); connector::hive::HiveInsertFileNameGenerator::registerSerDe(); diff --git a/velox/tool/trace/tests/PartitionedOutputReplayerTest.cpp b/velox/tool/trace/tests/PartitionedOutputReplayerTest.cpp index 584cbbe99f92..640b1a00ff6c 100644 --- a/velox/tool/trace/tests/PartitionedOutputReplayerTest.cpp +++ b/velox/tool/trace/tests/PartitionedOutputReplayerTest.cpp @@ -57,9 +57,9 @@ class PartitionedOutputReplayerTest serializer::presto::PrestoVectorSerde::registerVectorSerde(); } Type::registerSerDe(); - common::Filter::registerSerDe(); + velox::common::Filter::registerSerDe(); connector::hive::HiveTableHandle::registerSerDe(); - connector::hive::LocationHandle::registerSerDe(); + connector::hive::HiveLocationHandle::registerSerDe(); connector::hive::HiveColumnHandle::registerSerDe(); connector::hive::HiveInsertTableHandle::registerSerDe(); connector::hive::HiveInsertFileNameGenerator::registerSerDe(); diff --git a/velox/tool/trace/tests/TableScanReplayerTest.cpp b/velox/tool/trace/tests/TableScanReplayerTest.cpp index 0aa8b708e25e..a57eb49223ee 100644 --- a/velox/tool/trace/tests/TableScanReplayerTest.cpp +++ b/velox/tool/trace/tests/TableScanReplayerTest.cpp @@ -59,9 +59,9 @@ class TableScanReplayerTest : public HiveConnectorTestBase { serializer::presto::PrestoVectorSerde::registerVectorSerde(); } Type::registerSerDe(); - common::Filter::registerSerDe(); + velox::common::Filter::registerSerDe(); connector::hive::HiveTableHandle::registerSerDe(); - connector::hive::LocationHandle::registerSerDe(); + connector::hive::HiveLocationHandle::registerSerDe(); connector::hive::HiveColumnHandle::registerSerDe(); connector::hive::HiveInsertTableHandle::registerSerDe(); connector::hive::HiveInsertFileNameGenerator::registerSerDe(); @@ -298,11 +298,11 @@ TEST_F(TableScanReplayerTest, subfieldPrunning) { auto vectors = makeVectors(10, 1'000, rowType); auto filePath = TempFilePath::create(); writeToFile(filePath->getPath(), vectors); - std::vector requiredSubfields; + std::vector requiredSubfields; requiredSubfields.emplace_back("e.c"); std::unordered_map< std::string, - std::shared_ptr> + std::shared_ptr> assignments; assignments["e"] = std::make_shared( "e", diff --git a/velox/tool/trace/tests/TableWriterReplayerTest.cpp b/velox/tool/trace/tests/TableWriterReplayerTest.cpp index 1e378e85c157..edf8acf192e5 100644 --- a/velox/tool/trace/tests/TableWriterReplayerTest.cpp +++ b/velox/tool/trace/tests/TableWriterReplayerTest.cpp @@ -59,9 +59,9 @@ class TableWriterReplayerTest : public HiveConnectorTestBase { serializer::presto::PrestoVectorSerde::registerVectorSerde(); } 
     Type::registerSerDe();
-    common::Filter::registerSerDe();
+    velox::common::Filter::registerSerDe();
     connector::hive::HiveTableHandle::registerSerDe();
-    connector::hive::LocationHandle::registerSerDe();
+    connector::hive::HiveLocationHandle::registerSerDe();
     connector::hive::HiveColumnHandle::registerSerDe();
     connector::hive::HiveInsertTableHandle::registerSerDe();
     connector::hive::HiveInsertFileNameGenerator::registerSerDe();
@@ -95,7 +95,7 @@ class TableWriterReplayerTest : public HiveConnectorTestBase {
   // Helper method to return InsertTableHandle.
   std::shared_ptr createInsertTableHandle(
       const RowTypePtr& outputRowType,
-      const connector::hive::LocationHandle::TableType& outputTableType,
+      const connector::common::LocationHandle::TableType& outputTableType,
       const std::string& outputDirectoryPath,
       const std::vector& partitionedBy,
       const std::shared_ptr bucketProperty,
@@ -106,11 +106,8 @@ class TableWriterReplayerTest : public HiveConnectorTestBase {
         outputRowType->names(),
         outputRowType->children(),
         partitionedBy,
-        bucketProperty,
         makeLocationHandle(
-            outputDirectoryPath, std::nullopt, outputTableType),
-        fileFormat_,
-        compressionKind));
+            outputDirectoryPath, std::nullopt, outputTableType)));
   }
 
   // Returns a table insert plan node.
@@ -122,9 +119,9 @@ class TableWriterReplayerTest : public HiveConnectorTestBase {
       const std::vector& partitionedBy = {},
       std::shared_ptr bucketProperty = nullptr,
       const std::optional compressionKind = {},
-      const connector::hive::LocationHandle::TableType& outputTableType =
-          connector::hive::LocationHandle::TableType::kNew,
-      const CommitStrategy& outputCommitStrategy = CommitStrategy::kNoCommit,
+      const connector::common::LocationHandle::TableType& outputTableType =
+          connector::common::LocationHandle::TableType::kNew,
+      const connector::common::CommitStrategy& outputCommitStrategy = connector::common::CommitStrategy::kNoCommit,
       bool aggregateResult = true,
       std::shared_ptr aggregationNode = nullptr) {
     auto insertPlan = inputPlan
@@ -157,8 +154,8 @@ class TableWriterReplayerTest : public HiveConnectorTestBase {
       const std::shared_ptr& aggregationNode,
       const std::shared_ptr& insertHandle,
       bool hasPartitioningScheme,
-      connector::CommitStrategy commitStrategy =
-          connector::CommitStrategy::kNoCommit) {
+      connector::common::CommitStrategy commitStrategy =
+          connector::common::CommitStrategy::kNoCommit) {
     return [=](core::PlanNodeId nodeId,
                core::PlanNodePtr source) -> core::PlanNodePtr {
       std::shared_ptr aggNode = nullptr;
@@ -199,9 +196,9 @@ class TableWriterReplayerTest : public HiveConnectorTestBase {
     return ROW(std::move(dataColumnNames), std::move(dataColumnTypes));
   }
 
-  std::vector>
+  std::vector>
   makeHiveSplitsFromDirectory(const std::string& directoryPath) {
-    std::vector> splits;
+    std::vector> splits;
     for (auto& path : fs::recursive_directory_iterator(directoryPath)) {
       if (path.is_regular_file()) {
diff --git a/velox/tool/trace/tests/TraceFileToolTest.cpp b/velox/tool/trace/tests/TraceFileToolTest.cpp
index 2a497d9a34bd..8ff8e4fcaec5 100644
--- a/velox/tool/trace/tests/TraceFileToolTest.cpp
+++ b/velox/tool/trace/tests/TraceFileToolTest.cpp
@@ -63,9 +63,9 @@ class TraceFileToolTest : public HiveConnectorTestBase {
       serializer::presto::PrestoVectorSerde::registerVectorSerde();
     }
     Type::registerSerDe();
-    common::Filter::registerSerDe();
+    velox::common::Filter::registerSerDe();
     connector::hive::HiveTableHandle::registerSerDe();
-    connector::hive::LocationHandle::registerSerDe();
+    connector::hive::HiveLocationHandle::registerSerDe();
connector::hive::HiveColumnHandle::registerSerDe(); connector::hive::HiveInsertTableHandle::registerSerDe(); connector::hive::HiveConnectorSplit::registerSerDe(); diff --git a/velox/type/tests/FilterSerDeTest.cpp b/velox/type/tests/FilterSerDeTest.cpp index 0b4c52c1a902..900a54be7a56 100644 --- a/velox/type/tests/FilterSerDeTest.cpp +++ b/velox/type/tests/FilterSerDeTest.cpp @@ -141,7 +141,7 @@ TEST_F(FilterSerDeTest, multiRangeFilter) { } TEST_F(FilterSerDeTest, multiFilter) { - std::vector> filters; + std::vector> filters; filters.emplace_back(std::make_unique()); filters.emplace_back(std::make_unique(false, true)); filters.emplace_back( diff --git a/velox/type/tests/NegatedBytesRangeBenchmark.cpp b/velox/type/tests/NegatedBytesRangeBenchmark.cpp index 8459e6db74e6..3d9777651261 100644 --- a/velox/type/tests/NegatedBytesRangeBenchmark.cpp +++ b/velox/type/tests/NegatedBytesRangeBenchmark.cpp @@ -164,13 +164,13 @@ int32_t main(int32_t argc, char** argv) { lo, false, false, hi, false, true, false)); // create MultiRange filter - std::vector> rangeFilters; + std::vector> rangeFilters; rangeFilters.emplace_back(std::make_unique( "", true, false, lo, false, true, false)); rangeFilters.emplace_back(std::make_unique( hi, false, false, "", true, false, false)); multiRanges.emplace_back( - std::make_unique(std::move(rangeFilters), false)); + std::make_unique(std::move(rangeFilters), false)); LOG(INFO) << "Generated filter for length " << len << " with percentage " << pct; diff --git a/velox/type/tests/NegatedBytesValuesBenchmark.cpp b/velox/type/tests/NegatedBytesValuesBenchmark.cpp index 78da7f714dca..1b5dcc1baa73 100644 --- a/velox/type/tests/NegatedBytesValuesBenchmark.cpp +++ b/velox/type/tests/NegatedBytesValuesBenchmark.cpp @@ -164,7 +164,7 @@ int32_t main(int32_t argc, char* argv[]) { std::make_unique(reject_vector, false)); // create MultiRange filter - std::vector> range_filters; + std::vector> range_filters; auto front = ++(reject_vector.begin()); auto back = reject_vector.begin(); range_filters.emplace_back(std::make_unique( @@ -177,7 +177,7 @@ int32_t main(int32_t argc, char* argv[]) { } range_filters.emplace_back(std::make_unique( *back, false, true, "", true, true, false)); - multi_ranges.emplace_back(std::make_unique( + multi_ranges.emplace_back(std::make_unique( std::move(range_filters), false)); LOG(INFO) << "Generated filter for length " << len << " with size " diff --git a/velox/type/tests/SubfieldFiltersBuilder.h b/velox/type/tests/SubfieldFiltersBuilder.h index 851b890a3509..703888b21d31 100644 --- a/velox/type/tests/SubfieldFiltersBuilder.h +++ b/velox/type/tests/SubfieldFiltersBuilder.h @@ -23,8 +23,8 @@ class SubfieldFiltersBuilder { public: SubfieldFiltersBuilder& add( const std::string& path, - std::unique_ptr filter) { - filters_[common::Subfield(path)] = std::move(filter); + std::unique_ptr filter) { + filters_[velox::common::Subfield(path)] = std::move(filter); return *this; } @@ -38,7 +38,7 @@ class SubfieldFiltersBuilder { inline SubfieldFilters singleSubfieldFilter( const std::string& path, - std::unique_ptr filter) { + std::unique_ptr filter) { return SubfieldFiltersBuilder().add(path, std::move(filter)).build(); } } // namespace facebook::velox::common::test diff --git a/velox/type/tests/TimestampTest.cpp b/velox/type/tests/TimestampTest.cpp index 44f1ffc9e427..6539c3053502 100644 --- a/velox/type/tests/TimestampTest.cpp +++ b/velox/type/tests/TimestampTest.cpp @@ -301,7 +301,7 @@ bool checkUtcToEpoch(int year, int mon, int mday, int hour, int min, int sec) { } 
 // namespace
 
 TEST(TimestampTest, compareWithToStringAlt) {
-  std::default_random_engine gen(common::testutil::getRandomSeed(42));
+  std::default_random_engine gen(velox::common::testutil::getRandomSeed(42));
   std::uniform_int_distribution distSec(
       Timestamp::kMinSeconds, Timestamp::kMaxSeconds);
   std::uniform_int_distribution distNano(0, Timestamp::kMaxNanos);
@@ -342,7 +342,7 @@ TEST(TimestampTest, utcToEpoch) {
 }
 
 TEST(TimestampTest, utcToEpochRandomInputs) {
-  std::default_random_engine gen(common::testutil::getRandomSeed(42));
+  std::default_random_engine gen(velox::common::testutil::getRandomSeed(42));
   std::uniform_int_distribution dist(INT32_MIN, INT32_MAX);
   for (int i = 0; i < 10'000; ++i) {
     checkUtcToEpoch(
diff --git a/velox/type/tz/tests/TimeZoneMapExternalInvalidTest.cpp b/velox/type/tz/tests/TimeZoneMapExternalInvalidTest.cpp
index c18512921e1f..47734e8ca7f8 100644
--- a/velox/type/tz/tests/TimeZoneMapExternalInvalidTest.cpp
+++ b/velox/type/tz/tests/TimeZoneMapExternalInvalidTest.cpp
@@ -29,7 +29,7 @@ using namespace std::chrono;
 DEBUG_ONLY_TEST(TimeZoneMapExternalInvalidTest, externalInvalid) {
   const int16_t testZoneId = 1681;
   const std::string testZone = "Africa/Abidjan";
-  common::testutil::TestValue::enable();
+  velox::common::testutil::TestValue::enable();
   SCOPED_TESTVALUE_SET(
       "facebook::velox::tz::locateZoneImpl",
       std::function(
diff --git a/velox/vector/VectorStream.h b/velox/vector/VectorStream.h
index 3219ed095f6f..54bd32f317b4 100644
--- a/velox/vector/VectorStream.h
+++ b/velox/vector/VectorStream.h
@@ -218,15 +218,15 @@ class VectorSerde {
     Options() = default;
 
     Options(
-        common::CompressionKind _compressionKind,
+        velox::common::CompressionKind _compressionKind,
         float _minCompressionRatio)
         : compressionKind(_compressionKind),
          minCompressionRatio(_minCompressionRatio) {}
 
     virtual ~Options() = default;
 
-    common::CompressionKind compressionKind{
-        common::CompressionKind::CompressionKind_NONE};
+    velox::common::CompressionKind compressionKind{
+        velox::common::CompressionKind::CompressionKind_NONE};
 
     /// Minimum achieved compression if compression is enabled. Compressing less
     /// than this causes subsequent compression attempts to be skipped. The more
     /// times compression misses the target the less frequently it is tried.
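Note on the recurring change in the hunks above: the series rewrites the short `common::` qualifier as `velox::common::` and also introduces references to a `connector::common` namespace (for example `connector::common::LocationHandle` and `connector::common::CommitStrategy`). A plausible reason is C++ qualified-name lookup: inside `facebook::velox::connector::...` scopes, the nearer `connector::common` namespace is found before `facebook::velox::common`, so the unqualified form stops resolving to types such as `CompressionKind` or `Filter`. The sketch below is a self-contained illustration of that lookup rule only; all declarations are hypothetical stand-ins, not Velox code.

// Standalone illustration of qualified-name lookup; the namespaces and types
// below are hypothetical stand-ins and are not Velox declarations.
namespace facebook::velox::common {
enum class CompressionKind { CompressionKind_NONE };
} // namespace facebook::velox::common

namespace facebook::velox::connector::common {
struct LocationHandle {}; // stands in for the connector-side "common" types
} // namespace facebook::velox::connector::common

namespace facebook::velox::connector {
inline void example() {
  // Writing `common::CompressionKind` here would not compile: lookup finds the
  // nearer connector::common namespace first, and it has no CompressionKind.
  velox::common::CompressionKind kind =
      velox::common::CompressionKind::CompressionKind_NONE; // fully qualified
  common::LocationHandle handle{}; // resolves to connector::common
  (void)kind;
  (void)handle;
}
} // namespace facebook::velox::connector

The same reasoning fits the serializer, substrait, tool/trace, and test hunks in this section, where only the qualifier changes and the surrounding logic is untouched.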
diff --git a/velox/vector/tests/EncodedVectorCopyTest.cpp b/velox/vector/tests/EncodedVectorCopyTest.cpp index b005e0998a9d..9c793b9ea281 100644 --- a/velox/vector/tests/EncodedVectorCopyTest.cpp +++ b/velox/vector/tests/EncodedVectorCopyTest.cpp @@ -617,7 +617,7 @@ TEST_P(EncodedVectorCopyTest, fuzzer) { VectorFuzzer::Options fuzzerOptions; fuzzerOptions.allowLazyVector = reuseSource(); fuzzerOptions.nullRatio = 0.05; - auto seed = common::testutil::getRandomSeed(42); + auto seed = velox::common::testutil::getRandomSeed(42); VectorFuzzer fuzzer(fuzzerOptions, pool(), seed); fuzzer::FuzzerGenerator rng(seed); #ifndef NDEBUG diff --git a/velox/vector/tests/VectorMakerTest.cpp b/velox/vector/tests/VectorMakerTest.cpp index 083ba0219d7b..801191ff9cd4 100644 --- a/velox/vector/tests/VectorMakerTest.cpp +++ b/velox/vector/tests/VectorMakerTest.cpp @@ -480,13 +480,13 @@ TEST_F(VectorMakerTest, nestedArrayVectorFromJson) { {{2, 3, 4}}, {{std::nullopt, 7}}, {{1, 3, 7, 9}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{std::nullopt}}, {{1, 2, std::nullopt}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, std::nullopt, {{1, 2, 3}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{4, 5}}, }); diff --git a/velox/vector/tests/VectorSaverTest.cpp b/velox/vector/tests/VectorSaverTest.cpp index 11cdd8fac403..47ce7d64fdb2 100644 --- a/velox/vector/tests/VectorSaverTest.cpp +++ b/velox/vector/tests/VectorSaverTest.cpp @@ -648,7 +648,7 @@ TEST_F(VectorSaverTest, exceptionContext) { auto messageFunction = [](VeloxException::Type /*exceptionType*/, auto* arg) -> std::string { auto* info = static_cast(arg); - auto filePath = common::generateTempFilePath(info->path, "vector"); + auto filePath = velox::common::generateTempFilePath(info->path, "vector"); if (!filePath.has_value()) { return "Cannot generate file path to store the vector."; } diff --git a/velox/vector/tests/VectorTest.cpp b/velox/vector/tests/VectorTest.cpp index 2ca313f52ffa..a9d8f56465cc 100644 --- a/velox/vector/tests/VectorTest.cpp +++ b/velox/vector/tests/VectorTest.cpp @@ -3197,7 +3197,7 @@ TEST_F(VectorTest, containsNullAtArrays) { auto data = makeNullableArrayVector({ {{1, 2}}, {{1, 2, std::nullopt, 3}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, std::nullopt, {{1, 2, 3, 4}}, }); @@ -3214,7 +3214,7 @@ TEST_F(VectorTest, containsNullAtMaps) { {{{1, 10}, {2, 20}}}, {{{3, 30}}}, {{{1, 10}, {2, 20}, {3, std::nullopt}, {4, 40}}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, std::nullopt, {{{1, 10}, {2, 20}, {3, 30}, {4, 40}}}, }); @@ -3250,7 +3250,7 @@ TEST_F(VectorTest, containsNullAtStructs) { makeNullableArrayVector({ {{1, 2}}, {{1, 2, std::nullopt, 3}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{1, 2, 3}}, std::nullopt, {{1, 2, 3, 4, 5}}, @@ -3728,7 +3728,7 @@ TEST_F(VectorTest, getLargeStringBuffer) { TEST_F(VectorTest, mapUpdate) { auto base = makeNullableMapVector({ {{{1, 1}, {2, 1}}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{{3, 1}}}, std::nullopt, {{{4, 1}}}, @@ -3736,7 +3736,7 @@ TEST_F(VectorTest, mapUpdate) { auto update = makeNullableMapVector({ {{{2, 2}, {3, 2}}}, {{{4, 2}}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{{5, 2}}}, std::nullopt, }); @@ -3803,7 +3803,7 @@ TEST_F(VectorTest, mapUpdateNullMapValue) { TEST_F(VectorTest, mapUpdateMultipleUpdates) { auto base = 
makeNullableMapVector({ {{{1, 1}, {2, 1}}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{{3, 1}}}, std::nullopt, {{{4, 1}}}, @@ -3812,16 +3812,16 @@ TEST_F(VectorTest, mapUpdateMultipleUpdates) { makeNullableMapVector({ {{{2, 2}, {3, 2}}}, {{{4, 2}}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{{5, 2}}}, std::nullopt, }), makeNullableMapVector({ {{{3, 3}, {4, 3}}}, std::nullopt, - common::testutil::optionalEmpty, - common::testutil::optionalEmpty, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, }), }; auto expected = makeNullableMapVector({ @@ -3841,7 +3841,7 @@ TEST_F(VectorTest, mapUpdateMultipleUpdates) { TEST_F(VectorTest, mapUpdateConstant) { auto base = makeNullableMapVector({ {{{1, 1}, {2, 1}}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{{3, 1}}}, std::nullopt, {{{4, 1}}}, @@ -3863,7 +3863,7 @@ TEST_F(VectorTest, mapUpdateConstant) { TEST_F(VectorTest, mapUpdateDictionary) { auto base = makeNullableMapVector({ {{{1, 1}, {2, 1}}}, - common::testutil::optionalEmpty, + velox::common::testutil::optionalEmpty, {{{3, 1}}}, std::nullopt, {{{4, 1}}},