Skip to content

Commit c10847c

Browse files
andishgar and kou authored
GH-47103: [Statistics][C++] Implement Statistics specification attribute ARROW:null_count:approximate (#47969)
### Rationale for this change

Enable `ARROW:null_count:approximate` support for `arrow::ArrayStatistics`, along with the corresponding GLib, Ruby and Python bindings.

### What changes are included in this PR?

Enable `ARROW:null_count:approximate` in C++ and bind it to `ArrayStatistics` in GLib, Ruby and Python.

### Are these changes tested?

Yes, I ran the relevant unit tests.

### Are there any user-facing changes?

Yes.

* The type of `arrow::ArrayStatistics::null_count` has been changed from `std::optional<int64_t>` to `std::optional<CountType>`
* New `garrow_array_statistics_is_null_count_exact()`/`garrow_array_statistics_get_null_count_{exact,approximate}()` functions in GLib.
* Add support for approximate value in `Arrow::ArrayStatistics#null_count` in Ruby.
* A new field `is_null_count_exact` has been added to `ArrayStatistics` in Python.

* GitHub Issue: #47103

Lead-authored-by: arash andishgar <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
1 parent c6aad8d commit c10847c

File tree

15 files changed

+200
-25
lines changed

15 files changed

+200
-25
lines changed

c_glib/arrow-glib/basic-array.cpp

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,22 @@ garrow_array_statistics_has_null_count(GArrowArrayStatistics *statistics)
456456
return priv->statistics.null_count.has_value();
457457
}
458458

459+
/**
460+
* garrow_array_statistics_is_null_count_exact:
461+
* @statistics: A #GArrowArrayStatistics.
462+
*
463+
* Returns: %TRUE if the null count is available and exact, %FALSE otherwise.
464+
*
465+
* Since: 23.0.0
466+
*/
467+
gboolean
468+
garrow_array_statistics_is_null_count_exact(GArrowArrayStatistics *statistics)
469+
{
470+
auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics);
471+
return priv->statistics.null_count.has_value() &&
472+
std::holds_alternative<int64_t>(*priv->statistics.null_count);
473+
}
474+
459475
/**
460476
* garrow_array_statistics_get_null_count:
461477
* @statistics: A #GArrowArrayStatistics.
@@ -464,19 +480,59 @@ garrow_array_statistics_has_null_count(GArrowArrayStatistics *statistics)
464480
* -1 otherwise.
465481
*
466482
* Since: 20.0.0
483+
*
484+
* Deprecated: 23.0.0. Use garrow_array_statistics_is_null_count_exact(),
485+
* garrow_array_statistics_get_null_count_exact() and
486+
* garrow_array_statistics_get_null_count_approximate() instead.
467487
*/
468488
gint64
469489
garrow_array_statistics_get_null_count(GArrowArrayStatistics *statistics)
490+
{
491+
return garrow_array_statistics_get_null_count_exact(statistics);
492+
}
493+
494+
/**
495+
* garrow_array_statistics_get_null_count_exact:
496+
* @statistics: A #GArrowArrayStatistics.
497+
*
498+
* Returns: 0 or larger value if @statistics has a valid exact null
499+
* count value, -1 otherwise.
500+
*
501+
* Since: 23.0.0
502+
*/
503+
gint64
504+
garrow_array_statistics_get_null_count_exact(GArrowArrayStatistics *statistics)
470505
{
471506
auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics);
472507
const auto &null_count = priv->statistics.null_count;
473-
if (null_count) {
474-
return null_count.value();
508+
if (null_count && std::holds_alternative<int64_t>(*null_count)) {
509+
return std::get<int64_t>(*null_count);
475510
} else {
476511
return -1;
477512
}
478513
}
479514

515+
/**
516+
* garrow_array_statistics_get_null_count_approximate:
517+
* @statistics: A #GArrowArrayStatistics.
518+
*
519+
* Returns: Non `NaN` value if @statistics has a valid approximate
520+
* null count value, `NaN` otherwise.
521+
*
522+
* Since: 23.0.0
523+
*/
524+
gdouble
525+
garrow_array_statistics_get_null_count_approximate(GArrowArrayStatistics *statistics)
526+
{
527+
auto priv = GARROW_ARRAY_STATISTICS_GET_PRIVATE(statistics);
528+
const auto &null_count = priv->statistics.null_count;
529+
if (null_count && std::holds_alternative<double>(*null_count)) {
530+
return std::get<double>(*null_count);
531+
} else {
532+
return std::nan("");
533+
}
534+
}
535+
480536
/**
481537
* garrow_array_statistics_has_distinct_count:
482538
* @statistics: A #GArrowArrayStatistics.

c_glib/arrow-glib/basic-array.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,21 @@ struct _GArrowArrayStatisticsClass
5454
GARROW_AVAILABLE_IN_20_0
5555
gboolean
5656
garrow_array_statistics_has_null_count(GArrowArrayStatistics *statistics);
57+
GARROW_AVAILABLE_IN_23_0
58+
gboolean
59+
garrow_array_statistics_is_null_count_exact(GArrowArrayStatistics *statistics);
60+
#ifndef GARROW_DISABLE_DEPRECATED
5761
GARROW_AVAILABLE_IN_20_0
62+
GARROW_DEPRECATED_IN_23_0_FOR(garrow_array_statistics_get_null_count_exact)
5863
gint64
5964
garrow_array_statistics_get_null_count(GArrowArrayStatistics *statistics);
65+
#endif
66+
GARROW_AVAILABLE_IN_23_0
67+
gint64
68+
garrow_array_statistics_get_null_count_exact(GArrowArrayStatistics *statistics);
69+
GARROW_AVAILABLE_IN_23_0
70+
gdouble
71+
garrow_array_statistics_get_null_count_approximate(GArrowArrayStatistics *statistics);
6072

6173
GARROW_AVAILABLE_IN_21_0
6274
gboolean

c_glib/test/test-array-statistics.rb

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,20 @@ def setup
4545
end
4646
end
4747

48-
test("#null_count") do
49-
assert_equal(1, @statistics.null_count)
48+
test("#null_count_exact?") do
49+
assert do
50+
@statistics.null_count_exact?
51+
end
52+
end
53+
54+
test("#null_count_exact") do
55+
assert_equal(1, @statistics.null_count_exact)
56+
end
57+
58+
test("#null_count_approximate") do
59+
assert do
60+
@statistics.null_count_approximate.nan?
61+
end
5062
end
5163

5264
test("#has_distinct_count?") do

cpp/src/arrow/array/array_test.cc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3934,7 +3934,7 @@ class TestArrayDataStatistics : public ::testing::Test {
39343934

39353935
protected:
39363936
std::vector<uint8_t> valids_;
3937-
size_t null_count_;
3937+
int64_t null_count_;
39383938
double distinct_count_;
39393939
double max_byte_width_;
39403940
double average_byte_width_;
@@ -3951,7 +3951,7 @@ TEST_F(TestArrayDataStatistics, MoveConstructor) {
39513951
ArrayData moved_data(std::move(copied_data));
39523952

39533953
ASSERT_TRUE(moved_data.statistics->null_count.has_value());
3954-
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
3954+
ASSERT_EQ(null_count_, std::get<int64_t>(moved_data.statistics->null_count.value()));
39553955

39563956
ASSERT_TRUE(moved_data.statistics->distinct_count.has_value());
39573957
ASSERT_DOUBLE_EQ(distinct_count_,
@@ -3981,7 +3981,7 @@ TEST_F(TestArrayDataStatistics, CopyConstructor) {
39813981
ArrayData copied_data(*data_);
39823982

39833983
ASSERT_TRUE(copied_data.statistics->null_count.has_value());
3984-
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
3984+
ASSERT_EQ(null_count_, std::get<int64_t>(copied_data.statistics->null_count.value()));
39853985

39863986
ASSERT_TRUE(copied_data.statistics->distinct_count.has_value());
39873987
ASSERT_DOUBLE_EQ(distinct_count_,
@@ -4013,7 +4013,7 @@ TEST_F(TestArrayDataStatistics, MoveAssignment) {
40134013
moved_data = std::move(copied_data);
40144014

40154015
ASSERT_TRUE(moved_data.statistics->null_count.has_value());
4016-
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
4016+
ASSERT_EQ(null_count_, std::get<int64_t>(moved_data.statistics->null_count.value()));
40174017

40184018
ASSERT_TRUE(moved_data.statistics->distinct_count.has_value());
40194019
ASSERT_DOUBLE_EQ(distinct_count_,
@@ -4044,7 +4044,7 @@ TEST_F(TestArrayDataStatistics, CopyAssignment) {
40444044
copied_data = *data_;
40454045

40464046
ASSERT_TRUE(copied_data.statistics->null_count.has_value());
4047-
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
4047+
ASSERT_EQ(null_count_, std::get<int64_t>(copied_data.statistics->null_count.value()));
40484048

40494049
ASSERT_TRUE(copied_data.statistics->distinct_count.has_value());
40504050
ASSERT_DOUBLE_EQ(distinct_count_,
@@ -4075,7 +4075,7 @@ TEST_F(TestArrayDataStatistics, CopyTo) {
40754075
data_->CopyTo(arrow::default_cpu_memory_manager()));
40764076

40774077
ASSERT_TRUE(copied_data->statistics->null_count.has_value());
4078-
ASSERT_EQ(null_count_, copied_data->statistics->null_count.value());
4078+
ASSERT_EQ(null_count_, std::get<int64_t>(copied_data->statistics->null_count.value()));
40794079

40804080
ASSERT_TRUE(copied_data->statistics->min.has_value());
40814081
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data->statistics->min.value()));

cpp/src/arrow/array/statistics.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,9 @@ struct ARROW_EXPORT ArrayStatistics {
7676
}
7777

7878
/// \brief The number of null values, may not be set
79-
std::optional<int64_t> null_count = std::nullopt;
79+
/// Note: when set to `int64_t`, it represents `exact_null_count`,
80+
/// and when set to `double`, it represents `approximate_null_count`.
81+
std::optional<CountType> null_count = std::nullopt;
8082

8183
/// \brief The number of distinct values, may not be set
8284
/// Note: when set to `int64_t`, it represents `exact_distinct_count`,

cpp/src/arrow/array/statistics_test.cc

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,20 @@
2525

2626
namespace arrow {
2727

28-
TEST(TestArrayStatistics, NullCount) {
28+
TEST(TestArrayStatistics, NullCountExact) {
2929
ArrayStatistics statistics;
3030
ASSERT_FALSE(statistics.null_count.has_value());
3131
statistics.null_count = 29;
3232
ASSERT_TRUE(statistics.null_count.has_value());
33-
ASSERT_EQ(29, statistics.null_count.value());
33+
ASSERT_EQ(29, std::get<int64_t>(statistics.null_count.value()));
34+
}
35+
36+
TEST(TestArrayStatistics, NullCountApproximate) {
37+
ArrayStatistics statistics;
38+
ASSERT_FALSE(statistics.null_count.has_value());
39+
statistics.null_count = 29.0;
40+
ASSERT_TRUE(statistics.null_count.has_value());
41+
ASSERT_DOUBLE_EQ(29.0, std::get<double>(statistics.null_count.value()));
3442
}
3543

3644
TEST(TestArrayStatistics, DistinctCountExact) {
@@ -106,11 +114,18 @@ TEST(TestArrayStatistics, Equals) {
106114

107115
ASSERT_EQ(statistics1, statistics2);
108116

117+
// Test NULL_COUNT_EXACT
109118
statistics1.null_count = 29;
110119
ASSERT_NE(statistics1, statistics2);
111120
statistics2.null_count = 29;
112121
ASSERT_EQ(statistics1, statistics2);
113122

123+
// Test NULL_COUNT_APPROXIMATE
124+
statistics1.null_count = 29.0;
125+
ASSERT_NE(statistics1, statistics2);
126+
statistics2.null_count = 29.0;
127+
ASSERT_EQ(statistics1, statistics2);
128+
114129
// Test DISTINCT_COUNT_EXACT
115130
statistics1.distinct_count = static_cast<int64_t>(2929);
116131
ASSERT_NE(statistics1, statistics2);

cpp/src/arrow/compare.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1563,7 +1563,8 @@ bool ArrayStatisticsOptionalValueEquals(const std::optional<Type>& left,
15631563

15641564
bool ArrayStatisticsEqualsImpl(const ArrayStatistics& left, const ArrayStatistics& right,
15651565
const EqualOptions& equal_options) {
1566-
return left.null_count == right.null_count &&
1566+
return ArrayStatisticsOptionalValueEquals(left.null_count, right.null_count,
1567+
equal_options) &&
15671568
ArrayStatisticsOptionalValueEquals(left.distinct_count, right.distinct_count,
15681569
equal_options) &&
15691570
ArrayStatisticsOptionalValueEquals(left.max_byte_width, right.max_byte_width,

cpp/src/arrow/record_batch.cc

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -536,10 +536,17 @@ Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics on_stat
536536
statistics.nth_column = nth_column;
537537
if (column_statistics->null_count.has_value()) {
538538
statistics.nth_statistics++;
539-
statistics.key = ARROW_STATISTICS_KEY_NULL_COUNT_EXACT;
540-
statistics.type = int64();
541-
statistics.value = column_statistics->null_count.value();
542-
RETURN_NOT_OK(on_statistics(statistics));
539+
if (std::holds_alternative<int64_t>(column_statistics->null_count.value())) {
540+
statistics.key = ARROW_STATISTICS_KEY_NULL_COUNT_EXACT;
541+
statistics.type = int64();
542+
statistics.value = std::get<int64_t>(column_statistics->null_count.value());
543+
RETURN_NOT_OK(on_statistics(statistics));
544+
} else {
545+
statistics.key = ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE;
546+
statistics.type = float64();
547+
statistics.value = std::get<double>(column_statistics->null_count.value());
548+
RETURN_NOT_OK(on_statistics(statistics));
549+
}
543550
statistics.start_new_column = false;
544551
}
545552

cpp/src/arrow/record_batch_test.cc

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1456,7 +1456,7 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayRowCount) {
14561456
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
14571457
}
14581458

1459-
TEST_F(TestRecordBatch, MakeStatisticsArrayNullCount) {
1459+
TEST_F(TestRecordBatch, MakeStatisticsArrayNullCountExact) {
14601460
auto schema =
14611461
::arrow::schema({field("no-statistics", boolean()), field("int32", int32())});
14621462
auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
@@ -1486,6 +1486,37 @@ TEST_F(TestRecordBatch, MakeStatisticsArrayNullCount) {
14861486
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
14871487
}
14881488

1489+
TEST_F(TestRecordBatch, MakeStatisticsArrayNullCountApproximate) {
1490+
auto schema =
1491+
::arrow::schema({field("no-statistics", boolean()), field("int32", int32())});
1492+
auto no_statistics_array = ArrayFromJSON(boolean(), "[true, false, true]");
1493+
auto int32_array_data = ArrayFromJSON(int32(), "[1, null, -1]")->data()->Copy();
1494+
int32_array_data->statistics = std::make_shared<ArrayStatistics>();
1495+
int32_array_data->statistics->null_count = 1.0;
1496+
auto int32_array = MakeArray(std::move(int32_array_data));
1497+
auto batch = RecordBatch::Make(schema, int32_array->length(),
1498+
{no_statistics_array, int32_array});
1499+
1500+
ASSERT_OK_AND_ASSIGN(auto statistics_array, batch->MakeStatisticsArray());
1501+
1502+
ASSERT_OK_AND_ASSIGN(
1503+
auto expected_statistics_array,
1504+
MakeStatisticsArray("[null, 1]",
1505+
{{
1506+
ARROW_STATISTICS_KEY_ROW_COUNT_EXACT,
1507+
},
1508+
{
1509+
ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE,
1510+
}},
1511+
{{
1512+
ArrayStatistics::ValueType{int64_t{3}},
1513+
},
1514+
{
1515+
ArrayStatistics::ValueType{1.0},
1516+
}}));
1517+
AssertArraysEqual(*expected_statistics_array, *statistics_array, true);
1518+
}
1519+
14891520
TEST_F(TestRecordBatch, MakeStatisticsArrayDistinctCountExact) {
14901521
auto schema =
14911522
::arrow::schema({field("no-statistics", boolean()), field("int32", int32())});

cpp/src/parquet/arrow/arrow_statistics_test.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,8 @@ void TestStatisticsReadArray(std::shared_ptr<::arrow::DataType> arrow_type) {
236236
auto statistics = typed_read_array->statistics();
237237
ASSERT_NE(nullptr, statistics);
238238
ASSERT_EQ(true, statistics->null_count.has_value());
239-
ASSERT_EQ(1, statistics->null_count.value());
239+
ASSERT_EQ(true, std::holds_alternative<int64_t>(statistics->null_count.value()));
240+
ASSERT_EQ(1, std::get<int64_t>(statistics->null_count.value()));
240241
ASSERT_EQ(false, statistics->distinct_count.has_value());
241242
ASSERT_EQ(true, statistics->min.has_value());
242243
ASSERT_EQ(true, std::holds_alternative<MinMaxType>(*statistics->min));
@@ -356,7 +357,8 @@ TEST(TestStatisticsRead, MultipleRowGroupsShouldLoadStatistics) {
356357
auto statistics = typed_read_array->statistics();
357358
ASSERT_NE(nullptr, statistics);
358359
ASSERT_EQ(true, statistics->null_count.has_value());
359-
ASSERT_EQ(1, statistics->null_count.value());
360+
ASSERT_EQ(true, std::holds_alternative<int64_t>(statistics->null_count.value()));
361+
ASSERT_EQ(1, std::get<int64_t>(statistics->null_count.value()));
360362
ASSERT_EQ(false, statistics->distinct_count.has_value());
361363
ASSERT_EQ(true, statistics->min.has_value());
362364
// This is not -1 because this array has only the first 2 elements.

0 commit comments

Comments
 (0)