Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 58 additions & 2 deletions c_glib/arrow-glib/basic-array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,22 @@ garrow_array_statistics_has_null_count(GArrowArrayStatistics *statistics)
return priv->statistics.null_count.has_value();
}

/**
 * garrow_array_statistics_is_null_count_exact:
 * @statistics: A #GArrowArrayStatistics.
 *
 * Returns: %TRUE if @statistics has a null count and that count is
 *   held as an exact integer value, %FALSE otherwise.
 *
 * Since: 23.0.0
 */
gboolean
garrow_array_statistics_is_null_count_exact(GArrowArrayStatistics *statistics)
{
  auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics);
  const auto &null_count = priv->statistics.null_count;
  if (!null_count.has_value()) {
    // No null count is recorded at all, so it cannot be exact.
    return false;
  }
  // The count is exact only when the int64_t alternative is the one stored.
  return std::holds_alternative<int64_t>(*null_count);
}

/**
* garrow_array_statistics_get_null_count:
* @statistics: A #GArrowArrayStatistics.
Expand All @@ -464,19 +480,59 @@ garrow_array_statistics_has_null_count(GArrowArrayStatistics *statistics)
* -1 otherwise.
*
* Since: 20.0.0
*
* Deprecated: 23.0.0. Use garrow_array_statistics_is_null_count_exact(),
* garrow_array_statistics_get_null_count_exact() and
* garrow_array_statistics_get_null_count_approximate() instead.
*/
gint64
garrow_array_statistics_get_null_count(GArrowArrayStatistics *statistics)
{
  // Deprecated entry point: simply forwards to the exact-count getter and
  // hands back whatever that getter reports.
  const gint64 exact_null_count =
    garrow_array_statistics_get_null_count_exact(statistics);
  return exact_null_count;
}

/**
 * garrow_array_statistics_get_null_count_exact:
 * @statistics: A #GArrowArrayStatistics.
 *
 * Returns: 0 or larger value if @statistics has a valid exact null
 *   count value, -1 otherwise.
 *
 * Since: 23.0.0
 */
gint64
garrow_array_statistics_get_null_count_exact(GArrowArrayStatistics *statistics)
{
  auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics);
  const auto &null_count = priv->statistics.null_count;
  // The null count is an optional variant: only the int64_t alternative is
  // an exact count. A missing value or a double (approximate) value both
  // fall through to the -1 sentinel.
  if (null_count && std::holds_alternative<int64_t>(*null_count)) {
    return std::get<int64_t>(*null_count);
  } else {
    return -1;
  }
}

/**
 * garrow_array_statistics_get_null_count_approximate:
 * @statistics: A #GArrowArrayStatistics.
 *
 * Returns: The approximate null count (a non `NaN` value) if
 *   @statistics holds one, `NaN` otherwise.
 *
 * Since: 23.0.0
 */
gdouble
garrow_array_statistics_get_null_count_approximate(GArrowArrayStatistics *statistics)
{
  auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics);
  const auto &null_count = priv->statistics.null_count;
  // Bail out with NaN when there is no value, or when the stored value is
  // not the double alternative.
  if (!null_count || !std::holds_alternative<double>(*null_count)) {
    return std::nan("");
  }
  return std::get<double>(*null_count);
}

/**
* garrow_array_statistics_has_distinct_count:
* @statistics: A #GArrowArrayStatistics.
Expand Down
12 changes: 12 additions & 0 deletions c_glib/arrow-glib/basic-array.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,21 @@ struct _GArrowArrayStatisticsClass
GARROW_AVAILABLE_IN_20_0
gboolean
garrow_array_statistics_has_null_count(GArrowArrayStatistics *statistics);
GARROW_AVAILABLE_IN_23_0
gboolean
garrow_array_statistics_is_null_count_exact(GArrowArrayStatistics *statistics);
#ifndef GARROW_DISABLE_DEPRECATED
GARROW_AVAILABLE_IN_20_0
GARROW_DEPRECATED_IN_23_0_FOR(garrow_array_statistics_get_null_count_exact)
gint64
garrow_array_statistics_get_null_count(GArrowArrayStatistics *statistics);
#endif
GARROW_AVAILABLE_IN_23_0
gint64
garrow_array_statistics_get_null_count_exact(GArrowArrayStatistics *statistics);
GARROW_AVAILABLE_IN_23_0
gdouble
garrow_array_statistics_get_null_count_approximate(GArrowArrayStatistics *statistics);

GARROW_AVAILABLE_IN_21_0
gboolean
Expand Down
16 changes: 14 additions & 2 deletions c_glib/test/test-array-statistics.rb
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,20 @@ def setup
end
end

# The null count on @statistics is expected to be flagged as exact.
test("#null_count_exact?") do
  assert do
    @statistics.null_count_exact?
  end
end

# The exact getter is expected to return the integer null count.
test("#null_count_exact") do
  assert_equal(1, @statistics.null_count_exact)
end

# With an exact count stored, the approximate getter is expected to
# report NaN.
test("#null_count_approximate") do
  assert do
    @statistics.null_count_approximate.nan?
  end
end

test("#has_distinct_count?") do
Expand Down
12 changes: 6 additions & 6 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3934,7 +3934,7 @@ class TestArrayDataStatistics : public ::testing::Test {

protected:
std::vector<uint8_t> valids_;
size_t null_count_;
int64_t null_count_;
double distinct_count_;
double max_byte_width_;
double average_byte_width_;
Expand All @@ -3951,7 +3951,7 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) {
ArrayData moved_data(std::move(copied_data));

ASSERT_TRUE(moved_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
ASSERT_EQ(null_count_, std::get<int64_t>(moved_data.statistics->null_count.value()));

ASSERT_TRUE(moved_data.statistics->distinct_count.has_value());
ASSERT_DOUBLE_EQ(distinct_count_,
Expand Down Expand Up @@ -3981,7 +3981,7 @@ TEST_F(TestArrayDataStatistics, CopyConstructor) {
ArrayData copied_data(*data_);

ASSERT_TRUE(copied_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
ASSERT_EQ(null_count_, std::get<int64_t>(copied_data.statistics->null_count.value()));

ASSERT_TRUE(copied_data.statistics->distinct_count.has_value());
ASSERT_DOUBLE_EQ(distinct_count_,
Expand Down Expand Up @@ -4013,7 +4013,7 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) {
moved_data = std::move(copied_data);

ASSERT_TRUE(moved_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
ASSERT_EQ(null_count_, std::get<int64_t>(moved_data.statistics->null_count.value()));

ASSERT_TRUE(moved_data.statistics->distinct_count.has_value());
ASSERT_DOUBLE_EQ(distinct_count_,
Expand Down Expand Up @@ -4044,7 +4044,7 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) {
copied_data = *data_;

ASSERT_TRUE(copied_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
ASSERT_EQ(null_count_, std::get<int64_t>(copied_data.statistics->null_count.value()));

ASSERT_TRUE(copied_data.statistics->distinct_count.has_value());
ASSERT_DOUBLE_EQ(distinct_count_,
Expand Down Expand Up @@ -4075,7 +4075,7 @@ TEST_F(TestArrayDataStatistics, CopyTo) {
data_->CopyTo(arrow::default_cpu_memory_manager()));

ASSERT_TRUE(copied_data->statistics->null_count.has_value());
ASSERT_EQ(null_count_, copied_data->statistics->null_count.value());
ASSERT_EQ(null_count_, std::get<int64_t>(copied_data->statistics->null_count.value()));

ASSERT_TRUE(copied_data->statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data->statistics->min.value()));
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/arrow/array/statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ struct ARROW_EXPORT ArrayStatistics {
}

/// \brief The number of null values, may not be set
std::optional<int64_t> null_count = std::nullopt;
/// Note: when set to `int64_t`, it represents `exact_null_count`,
/// and when set to `double`, it represents `approximate_null_count`.
std::optional<CountType> null_count = std::nullopt;

/// \brief The number of distinct values, may not be set
/// Note: when set to `int64_t`, it represents `exact_distinct_count`,
Expand Down
19 changes: 17 additions & 2 deletions cpp/src/arrow/array/statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,20 @@

namespace arrow {

// An unset null count has no value; after assigning an integer, the stored
// value must be readable through the int64_t (exact) alternative.
TEST(TestArrayStatistics, NullCountExact) {
  ArrayStatistics statistics;
  ASSERT_FALSE(statistics.null_count.has_value());
  statistics.null_count = 29;
  ASSERT_TRUE(statistics.null_count.has_value());
  // null_count is an optional variant, so the integer is extracted with
  // std::get rather than compared against the variant directly.
  ASSERT_EQ(29, std::get<int64_t>(statistics.null_count.value()));
}

// An unset null count has no value; after assigning a floating-point number,
// the stored value must be readable through the double alternative.
TEST(TestArrayStatistics, NullCountApproximate) {
  ArrayStatistics stats;
  ASSERT_FALSE(stats.null_count.has_value());

  stats.null_count = 29.0;
  ASSERT_TRUE(stats.null_count.has_value());

  const auto& stored_count = stats.null_count.value();
  ASSERT_DOUBLE_EQ(29.0, std::get<double>(stored_count));
}

TEST(TestArrayStatistics, DistinctCountExact) {
Expand Down Expand Up @@ -106,11 +114,18 @@ TEST(TestArrayStatistics, Equals) {

ASSERT_EQ(statistics1, statistics2);

// Test NULL_COUNT_EXACT
statistics1.null_count = 29;
ASSERT_NE(statistics1, statistics2);
statistics2.null_count = 29;
ASSERT_EQ(statistics1, statistics2);

// Test NULL_COUNT_APPROXIMATE
statistics1.null_count = 29.0;
ASSERT_NE(statistics1, statistics2);
statistics2.null_count = 29.0;
ASSERT_EQ(statistics1, statistics2);

// Test DISTINCT_COUNT_EXACT
statistics1.distinct_count = static_cast<int64_t>(2929);
ASSERT_NE(statistics1, statistics2);
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/compare.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1563,7 +1563,8 @@ bool ArrayStatisticsOptionalValueEquals(const std::optional<Type>& left,

bool ArrayStatisticsEqualsImpl(const ArrayStatistics& left, const ArrayStatistics& right,
const EqualOptions& equal_options) {
return left.null_count == right.null_count &&
return ArrayStatisticsOptionalValueEquals(left.null_count, right.null_count,
equal_options) &&
ArrayStatisticsOptionalValueEquals(left.distinct_count, right.distinct_count,
equal_options) &&
ArrayStatisticsOptionalValueEquals(left.max_byte_width, right.max_byte_width,
Expand Down
15 changes: 11 additions & 4 deletions cpp/src/arrow/record_batch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -536,10 +536,17 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat
statistics.nth_column = nth_column;
if (column_statistics->null_count.has_value()) {
statistics.nth_statistics++;
statistics.key = ARROW_STATISTICS_KEY_NULL_COUNT_EXACT;
statistics.type = int64();
statistics.value = column_statistics->null_count.value();
RETURN_NOT_OK(on_statistics(statistics));
if (std::holds_alternative<int64_t>(column_statistics->null_count.value())) {
statistics.key = ARROW_STATISTICS_KEY_NULL_COUNT_EXACT;
statistics.type = int64();
statistics.value = std::get<int64_t>(column_statistics->null_count.value());
RETURN_NOT_OK(on_statistics(statistics));
} else {
statistics.key = ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE;
statistics.type = float64();
statistics.value = std::get<double>(column_statistics->null_count.value());
RETURN_NOT_OK(on_statistics(statistics));
}
statistics.start_new_column = false;
}

Expand Down
33 changes: 32 additions & 1 deletion cpp/src/arrow/record_batch_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1456,7 +1456,7 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayRowCount) {
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}

TEST_F(TestRecordBatch, MakeStatisticsArrayNullCount) {
TEST_F(TestRecordBatch, MakeStatisticsArrayNullCountExact) {
auto schema =
::arrow::schema({field("no-statistics", boolean()), field("int32", int32())});
auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
Expand Down Expand Up @@ -1486,6 +1486,37 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayNullCount) {
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}

// Builds a two-column batch where only the int32 column carries statistics
// (a floating-point null count) and checks the statistics array that
// MakeStatisticsArray() produces for it.
TEST_F(TestRecordBatch, MakeStatisticsArrayNullCountApproximate) {
  auto schema =
      ::arrow::schema({field("no-statistics", boolean()), field("int32", int32())});

  // Column without any statistics attached.
  auto boolean_array = ArrayFromJSON(boolean(), "[true, false, true]");

  // Column whose statistics hold the null count as a double.
  auto int32_data = ArrayFromJSON(int32(), "[1, null, -1]")->data()->Copy();
  auto int32_statistics = std::make_shared<ArrayStatistics>();
  int32_statistics->null_count = 1.0;
  int32_data->statistics = std::move(int32_statistics);
  auto int32_array = MakeArray(std::move(int32_data));

  auto record_batch =
      RecordBatch::Make(schema, int32_array->length(), {boolean_array, int32_array});

  ASSERT_OK_AND_ASSIGN(auto statistics_array, record_batch->MakeStatisticsArray());

  // Expected: a row-count entry with value 3, then an approximate
  // null-count entry with value 1.0.
  ASSERT_OK_AND_ASSIGN(
      auto expected_statistics_array,
      MakeStatisticsArray("[null, 1]",
                          {{
                               ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
                           },
                           {
                               ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE,
                           }},
                          {{
                               ArrayStatistics::ValueType{int64_t{3}},
                           },
                           {
                               ArrayStatistics::ValueType{1.0},
                           }}));
  AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
}

TEST_F(TestRecordBatch, MakeStatisticsArrayDistinctCountExact) {
auto schema =
::arrow::schema({field("no-statistics", boolean()), field("int32", int32())});
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/parquet/arrow/arrow_statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,8 @@ void TestStatisticsReadArray(std::shared_ptr<::arrow::DataType> arrow_type) {
auto statistics = typed_read_array->statistics();
ASSERT_NE(nullptr, statistics);
ASSERT_EQ(true, statistics->null_count.has_value());
ASSERT_EQ(1, statistics->null_count.value());
ASSERT_EQ(true, std::holds_alternative<int64_t>(statistics->null_count.value()));
ASSERT_EQ(1, std::get<int64_t>(statistics->null_count.value()));
ASSERT_EQ(false, statistics->distinct_count.has_value());
ASSERT_EQ(true, statistics->min.has_value());
ASSERT_EQ(true, std::holds_alternative<MinMaxType>(*statistics->min));
Expand Down Expand Up @@ -356,7 +357,8 @@ TEST(TestStatisticsRead, MultipleRowGroupsShouldLoadStatistics) {
auto statistics = typed_read_array->statistics();
ASSERT_NE(nullptr, statistics);
ASSERT_EQ(true, statistics->null_count.has_value());
ASSERT_EQ(1, statistics->null_count.value());
ASSERT_EQ(true, std::holds_alternative<int64_t>(statistics->null_count.value()));
ASSERT_EQ(1, std::get<int64_t>(statistics->null_count.value()));
ASSERT_EQ(false, statistics->distinct_count.has_value());
ASSERT_EQ(true, statistics->min.has_value());
// This is not -1 because this array has only the first 2 elements.
Expand Down
20 changes: 17 additions & 3 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -766,10 +766,24 @@ cdef class ArrayStatistics(_Weakrefable):
null_count = self.sp_statistics.get().null_count
# We'll be able to simplify this after
# https://github.com/cython/cython/issues/6692 is solved.
if null_count.has_value():
return null_count.value()
else:
if not null_count.has_value():
return None
value = null_count.value()
if holds_alternative[int64_t](value):
return get[int64_t](value)
else:
return get[double](value)

@property
def is_null_count_exact(self):
    """
    Whether a null count is present and stored as an exact integer.

    Returns False both when no null count is set and when the stored
    count is not an integer.
    """
    null_count = self.sp_statistics.get().null_count
    if null_count.has_value():
        count = null_count.value()
        return holds_alternative[int64_t](count)
    return False

@property
def distinct_count(self):
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_bool is_numeric(Type type)

cdef cppclass CArrayStatistics" arrow::ArrayStatistics":
optional[int64_t] null_count
optional[CArrayStatisticsCountType] null_count
optional[CArrayStatisticsCountType] distinct_count
optional[CArrayStatisticsValueType] min
c_bool is_min_exact
Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/tests/parquet/test_parquet_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,7 @@ def test_read_statistics():
buf.seek(0)

statistics = pq.ParquetFile(buf).read().columns[0].chunks[0].statistics
assert statistics.is_null_count_exact is True
assert statistics.null_count == 1
assert statistics.distinct_count is None
# TODO: add tests for is_distinct_count_exact == None and True
Expand Down
12 changes: 12 additions & 0 deletions ruby/red-arrow/lib/arrow/array-statistics.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,18 @@

module Arrow
class ArrayStatistics
# Only wrap #null_count when the exact getter is available on this class.
if method_defined?(:null_count_exact)
  # Keep the original reader reachable under a different name.
  alias_method :null_count_raw, :null_count

  # Returns nil when no null count is set, the exact count when it is
  # exact, and the approximate count otherwise.
  def null_count
    if !has_null_count?
      nil
    elsif null_count_exact?
      null_count_exact
    else
      null_count_approximate
    end
  end
end

if method_defined?(:distinct_count_exact)
alias_method :distinct_count_raw, :distinct_count
def distinct_count
Expand Down
Loading
Loading