Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
214 commits
Select commit Hold shift + click to select a range
d5aa7c4
Initial Commit
jacques-n Feb 5, 2016
cbc56bf
Update readme and add license in root.
jacques-n Feb 17, 2016
fa5f029
ARROW-1: Initial Arrow Code Commit
StevenMPhillips Feb 17, 2016
16e44e3
ARROW-3: This patch includes a WIP draft specification document for t…
wesm Feb 17, 2016
23c4b08
ARROW-4: This provides an partial C++11 implementation of the Apache …
wesm Feb 17, 2016
7e76e3a
ARROW-5: Update drill-fmpp-maven-plugin to 1.5.0
lw-lin Feb 19, 2016
e9cc8ce
ARROW-5: Correct Apache Maven repo for maven plugin use
jacques-n Feb 20, 2016
e6905ef
ARROW-9: Replace straggler references to Drill
lw-lin Feb 20, 2016
a385622
ARROW-8: Add .travis.yml and test script for Arrow C++. OS X build fixes
wesm Mar 1, 2016
8f2ca24
ARROW-13: Add PR merge tool from parquet-mr, suitably modified
wesm Mar 3, 2016
1000d11
ARROW-36: Remove fixVersions from JIRA resolve code path
wesm Mar 3, 2016
e418020
ARROW-19: Add an externalized MemoryPool interface for use in builder…
wesm Mar 3, 2016
b88b69e
ARROW-20: Add null_count_ member to array containers, remove nullable…
wesm Mar 3, 2016
89c6afd
ARROW-21: Implement a simple in-memory Schema data structure
wesm Mar 3, 2016
307977e
ARROW-15: Fix a naming typo for memory.AllocationManager.AllocationOu…
lw-lin Mar 4, 2016
0c95d3c
ARROW-10: Fix mismatch of javadoc names and method parameters
lw-lin Mar 4, 2016
3b777c7
ARROW-26: Add instructions for enabling Arrow C++ Parquet adapter build
wesm Mar 4, 2016
9c2b954
ARROW-23: Add a logical Column data structure
wesm Mar 4, 2016
612fbc7
ARROW-24: C++: Implement a logical Table container type
wesm Mar 5, 2016
572cdf2
ARROW-7: Add barebones Python library build toolchain
wesm Mar 7, 2016
8caa287
ARROW-35: Add a short call-to-action in the top level README.md
wesm Mar 7, 2016
571343b
ARROW-9: Rename some unchanged "Drill" to "Arrow" (follow-up)
HyukjinKwon Mar 7, 2016
9afb667
ARROW-31: Python: prototype user object model, add PyList conversion …
wesm Mar 7, 2016
ae95dbd
ARROW-44: Python: prototype object model for array slot values ("scal…
wesm Mar 8, 2016
45cd9fd
ARROW-43: Python: format array values to in __repr__ for interactive …
wesm Mar 8, 2016
1650026
ARROW-17: set some vector fields to package level access for Drill co…
StevenMPhillips Mar 1, 2016
243ed4e
ARROW-18: Fix decimal precision and scale in MapWriters
StevenMPhillips Mar 1, 2016
31def7d
ARROW-51: Add simple ValueVector tests
StevenMPhillips Mar 8, 2016
e822ea7
ARROW-46: ListVector should initialize bits in allocateNew
minji-kim Mar 7, 2016
8367527
ARROW-42: Add Python tests to Travis CI build
wesm Mar 9, 2016
6fdcd49
ARROW-54: [Python] Rename package to "pyarrow"
wesm Mar 9, 2016
883c62b
ARROW-55: [Python] Fix unit tests in 2.7
danrobinson Mar 16, 2016
5881aac
ARROW-64: Add zsh support to C++ build scripts
xhochy Mar 16, 2016
c996610
ARROW-68: Better error handling for not fully setup systems
emkornfield Mar 17, 2016
3a99f39
ARROW-73: Support older CMake versions
xhochy Mar 21, 2016
016b92b
ARROW-72: Search for alternative parquet-cpp header
xhochy Mar 21, 2016
4ec034b
ARROW-28: Adding google's benchmark library to the toolchain
emkornfield Mar 22, 2016
093f9bd
ARROW-75: Fix handling of empty strings
danrobinson Mar 22, 2016
65db0da
ARROW-67: C++ metadata flatbuffer serialization and data movement to …
wesm Mar 23, 2016
a4002c6
ARROW-70: Add adapt 'lite' DCHECK macros from Kudu as also used in Pa…
wesm Mar 23, 2016
fbbee3d
ARROW-77: [C++] Conform bitmap interpretation to ARROW-62; 1 for null…
wesm Mar 24, 2016
c06b765
ARROW-62: Clarify null bitmap interpretation, indicate bit-endianness…
wesm Mar 25, 2016
0a8979d
ARROW-37: [C++ / Python] Implement BooleanArray and BooleanBuilder. H…
wesm Mar 25, 2016
d3cb6b4
ARROW-22: [C++] Convert flat Parquet schemas to Arrow schemas
wesm Mar 27, 2016
d6d53b2
ARROW-63: [C++] Enable ctest to work on systems with Python 3 as the …
wesm Mar 27, 2016
0171877
ARROW-65: Be less restrictive on PYTHON_LIBRARY search paths
xhochy Mar 27, 2016
1fd0668
ARROW-30: [Python] Routines for converting between arrow::Array/Table…
wesm Mar 28, 2016
ecadd0b
ARROW-80: Handle len call for pre-init arrays
xhochy Mar 28, 2016
80ec2c1
ARROW-79: [Python] Add benchmarks
xhochy Mar 28, 2016
df7726d
ARROW-88: [C++] Refactor usages of parquet_cpp namespace
wesm Mar 28, 2016
38897ee
ARROW-83: [C++] Add basic test infrastructure for DecimalType
xhochy Mar 28, 2016
2d8627c
ARROW-87: [C++] Add all four possible ways to encode Decimals in Parq…
xhochy Mar 28, 2016
5a68f8d
ARROW-93: Fix builds when using XCode 7.3
danrobinson Mar 31, 2016
b3ebce1
ARROW-89: [Python] Add benchmarks for Arrow<->Pandas conversion
xhochy Apr 1, 2016
6d31d59
ARROW-49: [Python] Add Column and Table wrapper interface
xhochy Apr 1, 2016
79fddd1
ARROW-90: [C++] Check for SIMD instruction set support
xhochy Apr 1, 2016
5d12999
ARROW-71: [C++] Add clang-tidy and clang-format to the the tool chain.
emkornfield Apr 2, 2016
9d88a50
ARROW-86: [Python] Implement zero-copy Arrow-to-Pandas conversion
xhochy Apr 3, 2016
7b2153b
ARROW-85: memcmp can be avoided in Equal when comparing with the same …
Apr 11, 2016
37f7271
ARROW-94: [Format] Expand list example to clarify null vs empty list
emkornfield Apr 14, 2016
5843e68
ARROW-103: Add files to gitignore
danrobinson Apr 17, 2016
0b472d8
ARROW-82: Initial IPC support for ListArray
emkornfield Apr 18, 2016
a541644
ARROW-100: [C++] Computing RowBatch size
pcmoritz Apr 23, 2016
56514d9
ARROW-104: [FORMAT] Add alignment and padding requirements + union cl…
emkornfield Apr 30, 2016
355f7c9
ARROW-92: Arrow to Parquet Schema conversion
xhochy May 1, 2016
ad3d01d
ARROW-188: Add numpy as install requirement
xhochy May 4, 2016
3302257
ARROW-190: Python: Provide installable sdist builds
xhochy May 9, 2016
c9ffe54
ARROW-194: C++: Allow read-only memory mapped source
jihoonson May 9, 2016
1f04f7f
ARROW-193: typos "int his" fix to "in this"
lfzCarlosC May 5, 2016
4bd13b8
ARROW-91: Basic Parquet read support
xhochy May 10, 2016
68b80a8
ARROW-197: Working first draft of a conda recipe for pyarrow
wesm May 15, 2016
6968ec0
ARROW-199: [C++] Refine third party dependency
zhangh43 May 15, 2016
9c59158
ARROW-185: Make padding and alignment for all buffers be 64 bytes
emkornfield May 17, 2016
978de1a
ARROW-204: Add Travis CI builds that post conda artifacts for Linux a…
wesm May 18, 2016
e0fb369
ARROW-201: [C++] Initial ParquetWriter implementation
xhochy May 18, 2016
c0985a4
Make BaseValueVector#MAX_ALLOCATION_SIZE configurable
laurentgo Apr 18, 2016
e316b3f
Fix BaseAllocator.java NPE when assertions are disabled
laurentgo Apr 15, 2016
7035467
Add java support to Travis CI
laurentgo Apr 14, 2016
cd1d770
ARROW-206: Expose a C++ api to compare ranges of slots between two ar…
emkornfield May 23, 2016
c8b8078
[Doc] Update Layout.md
ebegoli May 28, 2016
6574095
ARROW-209: [C++] Triage builds due to unavailable LLVM apt repo
wesm Jun 3, 2016
ce2fe7a
ARROW-211: [Format] Fixed typos in layout examples
Smyatkin-Maxim Jun 7, 2016
9ce13a0
ARROW-60: [C++] Struct type builder API
Jun 7, 2016
bc6c4c8
ARROW-200: [C++/Python] Return error status on string initialization …
emkornfield Jun 8, 2016
8197f24
ARROW-212: Change contract of PrimitiveArray to reflect its abstractness
emkornfield Jun 8, 2016
ec66ddd
ARROW-203: Python: Basic filename based Parquet read/write
xhochy Jun 10, 2016
b4e0e93
ARROW-217: Fix Travis w.r.t conda 4.1.0 changes
xhochy Jun 15, 2016
790d541
ARROW-218: Add optional API token authentication option to PR merge tool
wesm Jun 16, 2016
27edd25
ARROW-210: Cleanup of the string related types in C++ code base
emkornfield Jun 16, 2016
a3e3849
ARROW-219: Preserve CMAKE_CXX_FLAGS, fix compiler warnings
wesm Jun 17, 2016
f7ade7b
ARROW-223: Do not link against libpython
xhochy Jun 21, 2016
ef90830
ARROW-222: Prototyping an IO interface for Arrow, with initial HDFS t…
wesm Jun 24, 2016
2f52cf4
ARROW-215: Support other integer types and strings in Parquet I/O
xhochy Jun 12, 2016
fab4c82
ARROW-234: Build libhdfs IO extension in conda artifacts
wesm Jul 1, 2016
77598fa
ARROW-233: Add visibility macros, add static build option
wesm Jul 10, 2016
ff6132f
ARROW-237: Implement parquet-cpp's abstract IO interfaces for memory …
wesm Jul 12, 2016
62390d8
ARROW-106: [C++] Add IPC to binary/string types
emkornfield Jul 13, 2016
55bfa83
ARROW-238: Change InternalMemoryPool::Free() to return Status::Invali…
jihoonson Jul 18, 2016
59e5f98
ARROW-236: Bridging IO interfaces under the hood in pyarrow
wesm Jul 18, 2016
a2fb756
ARROW-241: Add missing implementation for splitAndTransfer in UnionVe…
StevenMPhillips Jul 19, 2016
dc79ceb
ARROW-244: Some global APIs of IPC module should be visible to the ou…
jihoonson Aug 1, 2016
356d015
ARROW-240: Provide more detailed installation instructions for pyarro…
MechCoder Jul 14, 2016
3a2dfba
ARROW-101: Fix java compiler warnings
laurentgo Aug 1, 2016
56835c3
ARROW-246: [Java] UnionVector doesn't call allocateNew() when creatin…
adeneche Aug 1, 2016
5df7d4d
ARROW-247: Missing explicit destructor in RowBatchReader causes an in…
jihoonson Aug 4, 2016
34e7f48
ARROW-250: Fix for ARROW-246 may cause memory leaks
adeneche Aug 5, 2016
2742d37
ARROW-254: remove Bit type as it is redundant with Boolean
julienledem Aug 12, 2016
dc01f09
ARROW-253: restrict ints to 8, 16, 32, or 64 bits in V1
julienledem Aug 12, 2016
e8724f8
ARROW-260: Fix flaky oversized tests
jihoonson Aug 13, 2016
689cd27
ARROW-245: add endianness to RecordBatch
julienledem Aug 15, 2016
268e108
ARROW-251: Expose APIs for getting code and message of the status
jihoonson Aug 16, 2016
246a126
ARROW-107: [C++] Implement IPC for structs
emkornfield Aug 16, 2016
e7e399d
ARROW-259: Use Flatbuffer Field type instead of MaterializedField
StevenMPhillips May 24, 2016
fd2e524
Revert version to 0.1-SNAPSHOT
StevenMPhillips Aug 18, 2016
282fcac
ARROW-265: Pad negative decimal values with1
StevenMPhillips Aug 19, 2016
c2eb161
ARROW-265: Fix few decimal bugs
StevenMPhillips Aug 20, 2016
812201a
ARROW-266: [C++] Fix broken build due to Flatbuffers namespace change
wesm Aug 20, 2016
7861968
ARROW-252: Add implementation guidelines to the documentation
julienledem Aug 20, 2016
8960a2e
ARROW-255: Finalize Dictionary representation
julienledem Aug 20, 2016
ec51d56
ARROW-269: Include typeVector buffers UnionVector.getBuffers()
StevenMPhillips Aug 22, 2016
803afeb
ARROW-264: File format
julienledem Aug 26, 2016
907cc5a
ARROW-262: Start metadata specification document
wesm Aug 28, 2016
e081a4c
ARROW-271: Update Field structure to be more explicit
julienledem Aug 28, 2016
0a411fd
ARROW-242: Support Timestamp Data Type
xhochy Aug 28, 2016
e197b2d
ARROW-279: rename vector module to arrow-vector
julienledem Aug 29, 2016
2d8ec78
ARROW-274: Add NullableMapVector to support nullable maps
julienledem Sep 6, 2016
637584b
ARROW-284: Disable arrow_parquet module in Travis CI to triage builds
wesm Sep 7, 2016
214b861
ARROW-283: [C++] Account for upstream changes in parquet-cpp
wesm Sep 7, 2016
270ab4e
ARROW-278: [Format] Rename Tuple to Struct_ in flatbuffers IDL
wesm Sep 7, 2016
52089d6
ARROW-285: Optional flatc download
laurentgo Sep 8, 2016
a5f2861
ARROW-286: Build thirdparty dependencies in parallel
xhochy Sep 9, 2016
077c72b
ARROW-256: [Format] Add a version number to the IPC/RPC metadata
wesm Sep 9, 2016
6b8abb4
ARROW-289: Install test-util.h
xhochy Sep 13, 2016
6f99156
ARROW-287: Make nullable vectors use a BitVecor instead of UInt1Vecto…
julienledem Sep 13, 2016
3487c2f
ARROW-292: [Java] Upgrade Netty to 4.0.41
Sep 14, 2016
17e90e1
ARROW-290: Specialize alloc() in ArrowBuf
julienledem Sep 16, 2016
559b865
ARROW-280: [C++] Refactor IPC / memory map IO to use common arrow_io …
wesm Sep 18, 2016
5f1556c
ARROW-297: Fix Arrow pom for release
julienledem Sep 19, 2016
5358328
ARROW-298: create release scripts
julienledem Sep 21, 2016
430bd95
ARROW-299: Use absolute namespace in macros
xhochy Sep 21, 2016
7e39747
ARROW-267: [C++] Implement file format layout for IPC/RPC
wesm Sep 21, 2016
32fd692
ARROW-296: [Python / C++] Remove arrow::parquet, make pyarrow link ag…
wesm Sep 25, 2016
45d8832
ARROW-293: [C++] Implement Arrow IO interfaces for operating system f…
wesm Sep 27, 2016
03134b1
ARROW-270: Define more generic Interval logical type
julienledem Sep 27, 2016
bae33d6
ARROW-304: NullableMapReaderImpl.isSet() always returns true
julienledem Sep 27, 2016
768c7d0
ARROW-257: Add a typeids Vector to Union type
julienledem Sep 27, 2016
bd195e3
ARROW-308: UnionListWriter.setPosition() should not call startList()
Sep 28, 2016
bf30235
ARROW-306: Add option to pass cmake arguments via environment variable
xhochy Sep 29, 2016
30f6083
ARROW-305: Add compression and use_dictionary options to Parquet
xhochy Sep 29, 2016
391ab64
ARROW-309: Types.getMinorTypeForArrowType() does not work for Union type
julienledem Sep 30, 2016
c7b0480
ARROW-314: JSONScalar is unnecessary and unused
julienledem Oct 3, 2016
c3930a0
ARROW-301: Add user field metadata to IPC schemas
julienledem Oct 3, 2016
c7e6a07
ARROW-302: [C++/Python] Implement C++ IO interfaces for interacting w…
wesm Oct 4, 2016
c3cfa3d
ARROW-313: Build on any version of XCode
Oct 4, 2016
7fb4d24
ARROW-315: finalize timestamp
julienledem Oct 4, 2016
dd1b95b
ARROW-318: Revise python/README.md given recent changes in codebase
wesm Oct 5, 2016
04cf874
ARROW-321: fix arrow licenses
julienledem Oct 5, 2016
f1a4bd1
ARROW-320: ComplexCopier.copy(FieldReader, FieldWriter) should not st…
Oct 6, 2016
3f85cee
ARROW-324: Update arrow metadata diagram
julienledem Oct 7, 2016
2d8e820
ARROW-319: Add canonical Arrow Schema json representation
julienledem Oct 7, 2016
1196691
ARROW-326: Initialize nested writers in MapWriter based on the underl…
StevenMPhillips Oct 7, 2016
eb1491a
ARROW-325: make TestArrowFile not dependent on timezone
julienledem Oct 8, 2016
e7080ef
[maven-release-plugin] prepare release apache-arrow-0.1.0
julienledem Oct 8, 2016
17cd7a6
[maven-release-plugin] prepare for next development iteration
julienledem Oct 8, 2016
a9747ce
ARROW-312: Read and write Arrow IPC file format from Python
wesm Oct 10, 2016
fb799bc
ARROW-112: Changed constexprs to kValue naming.
leifwalsh Oct 11, 2016
8c8d341
ARROW-326: Include scale and precision when materializing decimal writer
StevenMPhillips Oct 10, 2016
994aa5a
ARROW-189: Build 3rd party with ExternalProject.
leifwalsh Oct 11, 2016
caa843b
ARROW-333: Make writers update their internal schema even when no dat…
julienledem Oct 12, 2016
3919a27
ARROW-332: Add RecordBatch.to_pandas method
wesm Oct 12, 2016
bf749f5
ARROW-275: Add tests for UnionVector in Arrow File
julienledem Oct 12, 2016
4ecf327
ARROW-191: Python: Provide infrastructure for manylinux1 wheels
xhochy Oct 16, 2016
8520061
ARROW-336: Run Apache Rat in Travis builds
xhochy Oct 16, 2016
8e8b17f
ARROW-97: API documentation via sphinx-apidoc
xhochy Oct 16, 2016
732a205
ARROW-261: Refactor String/Binary code paths to reflect unnested (non…
wesm Oct 17, 2016
676c32c
ARROW-317: Add Slice, Copy methods to Buffer
wesm Oct 18, 2016
e2c0a18
ARROW-327: [Python] Remove conda builds from Travis CI setup
wesm Oct 18, 2016
446ec9b
ARROW-334: [Python] Remove INSTALL_RPATH_USE_LINK_PATH
wesm Oct 18, 2016
2f84493
ARROW-342: Set Python version on release
xhochy Oct 21, 2016
3d2e4df
ARROW-337: UnionListWriter.list() is doing more than it should, this …
Oct 26, 2016
6178bf7
ARROW-350: Added Kerberos to HDFS client
Oct 29, 2016
da24c1a
ARROW-339: Python 3 compatibility in merge_arrow_pr.py
wesm Oct 29, 2016
d946e79
ARROW-354: Fix comparison of arrays of empty strings
xhochy Oct 29, 2016
772bc6e
ARROW-349: Add six as a requirement
hoffmann Oct 30, 2016
ca088dd
ARROW-339: [Dev] Lingering Python 3 fixes
wesm Nov 1, 2016
d414875
ARROW-348: [Python] Add build-type command line option to setup.py, b…
wesm Nov 1, 2016
c7db80e
ARROW-355: Add tests for serialising arrays of empty strings to Parquet
xhochy Nov 1, 2016
e70d97d
ARROW-358: Add explicit environment variable to locate libhdfs in one…
wesm Nov 2, 2016
2a059bd
ARROW-359: Document ARROW_LIBHDFS_DIR
Nov 2, 2016
17c9ae7
ARROW-357: Use a single RowGroup for Parquet files as default.
xhochy Nov 2, 2016
25e0106
ARROW-323: [Python] Opt-in to pyarrow.parquet extension rather than a…
wesm Nov 3, 2016
e8bc1fe
ARROW-368: Added note for LD_LIBRARY_PATH in Python README
BryanCutler Nov 6, 2016
121e826
ARROW-361: Python: Support reading a column-selection from Parquet files
xhochy Nov 6, 2016
79344b3
ARROW-362: Fix memory leak in zero-copy arrow to NumPy/pandas conversion
wesm Nov 7, 2016
6996c17
ARROW-312: [Java] IPC file round trip tool for integration testing
julienledem Nov 8, 2016
4fa7ac4
ARROW-372: json vector serialization format
julienledem Nov 9, 2016
7f048a4
ARROW-356: Add documentation about reading Parquet
xhochy Nov 11, 2016
48f9780
ARROW-375: Fix unicode Python 3 issue in columns argument of parquet.…
wesm Nov 11, 2016
78288b5
ARROW-371: Handle pandas-nullable types correctly
xhochy Nov 16, 2016
8417096
ARROW-367: converter json <=> Arrow file format for Integration tests
julienledem Nov 18, 2016
ed6ec3b
ARROW-373: [C++] JSON serialization format for testing
wesm Nov 18, 2016
58bd7be
implement dense unions
pcmoritz Nov 18, 2016
c88bd70
Build arrow_io and arrow_ipc as static libraries.
robertnishihara Nov 20, 2016
bdae7a2
builder: speed up bitsetting for large length
atumanov Mar 5, 2017
ae9c5d9
Merge pull request #3 from atumanov/fast-bitsetting
pcmoritz Mar 5, 2017
9caa1d1
Merge pull request #1 from pcmoritz/static
pcmoritz Mar 8, 2017
90902e9
upgrade flatbuffers
pcmoritz Mar 8, 2017
a4a5526
Merge pull request #4 from pcmoritz/upgrade-flatbuf
pcmoritz Mar 8, 2017
1924a78
parallelize memcopy in arrow with openmp
atumanov Mar 9, 2017
d501ad2
parallelize memcopy in arrow with openmp
atumanov Mar 9, 2017
3be7bdc
Merge branch 'parallel-arrow-memcpy' of github.com:atumanov/arrow int…
atumanov Mar 9, 2017
a2aeb01
fully switch arrow memcpy parallelization to c++ threads
atumanov Mar 13, 2017
59fe77a
parallelize arrow memset: add to memory util
atumanov Mar 13, 2017
a61194a
arrow: adding reusable threadpool for parallel memcpy+memset
atumanov Mar 14, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ set(THIRDPARTY_DIR "${CMAKE_SOURCE_DIR}/thirdparty")

set(GTEST_VERSION "1.7.0")
set(GBENCHMARK_VERSION "1.0.0")
set(FLATBUFFERS_VERSION "1.3.0")
set(FLATBUFFERS_VERSION "1.6.0")

find_package(ClangTools)
if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR CLANG_TIDY_FOUND)
Expand Down
18 changes: 16 additions & 2 deletions cpp/src/arrow/builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,24 @@ void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int32_t leng

// Marks the next `length` slots as valid (bit = 1) in the validity bitmap and
// advances length_ by `length`.
//
// The work is split in three phases so that the bulk of a large append is a
// single memset instead of one SetBit call per slot:
//   1. leading bits, set one at a time until the write position is byte aligned
//   2. whole bytes, filled with 0xFF
//   3. trailing bits that do not fill a whole byte
//
// No bounds checking is done on null_bitmap_data_ here.
// NOTE(review): presumably the caller has already reserved room for `length`
// more slots -- confirm against the call sites.
void ArrayBuilder::UnsafeSetNotNull(int32_t length) {
  const int32_t new_length = length + length_;

  // Phase 1: bits needed to reach the next byte boundary (0 if already
  // aligned), clamped so that we never set more bits than were requested.
  int32_t pad_to_byte = (8 - (length_ % 8)) % 8;
  if (pad_to_byte > length) { pad_to_byte = length; }
  // The bit index is absolute, so start at the current end of the bitmap.
  for (int32_t i = length_; i < length_ + pad_to_byte; ++i) {
    BitUtil::SetBit(null_bitmap_data_, i);
  }

  // Phase 2: fast bitsetting of whole bytes. Thanks to the clamp above,
  // (length - pad_to_byte) is never negative for a non-negative length, so the
  // size_t conversion cannot wrap around.
  const int32_t fast_length = (length - pad_to_byte) / 8;
  memset(null_bitmap_data_ + ((length_ + pad_to_byte) / 8), 255,
      static_cast<size_t>(fast_length));

  // Phase 3: trailing bits
  for (int32_t i = length_ + pad_to_byte + (fast_length * 8); i < new_length; ++i) {
    BitUtil::SetBit(null_bitmap_data_, i);
  }

  length_ = new_length;
}

Expand Down
9 changes: 9 additions & 0 deletions cpp/src/arrow/builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,15 @@ class ARROW_EXPORT ArrayBuilder {
DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
};

// Minimal ArrayBuilder subclass whose Finish() reports success without
// producing anything: `out` is left exactly as the caller passed it.
class ARROW_EXPORT NullArrayBuilder : public ArrayBuilder {
 public:
  explicit NullArrayBuilder(MemoryPool* pool, const TypePtr& type)
      : ArrayBuilder(pool, type) {}

  virtual ~NullArrayBuilder() = default;

  // Always returns Status::OK(); `out` is not written.
  Status Finish(std::shared_ptr<Array>* out) override { return Status::OK(); }
};

} // namespace arrow

#endif // ARROW_BUILDER_H_
4 changes: 3 additions & 1 deletion cpp/src/arrow/io/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,11 @@ if(ARROW_HDFS)
${ARROW_IO_TEST_LINK_LIBS})
endif()

add_library(arrow_io SHARED
add_library(arrow_io STATIC
${ARROW_IO_SRCS}
)
set_property(TARGET arrow_io PROPERTY POSITION_INDEPENDENT_CODE 1)

target_link_libraries(arrow_io
LINK_PUBLIC ${ARROW_IO_LINK_LIBS}
LINK_PRIVATE ${ARROW_IO_PRIVATE_LINK_LIBS})
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/arrow/ipc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,10 @@ set(ARROW_IPC_SRCS
)

# TODO(wesm): SHARED and STATIC targets
add_library(arrow_ipc SHARED
add_library(arrow_ipc STATIC
${ARROW_IPC_SRCS}
)
set_property(TARGET arrow_ipc PROPERTY POSITION_INDEPENDENT_CODE 1)
if(FLATBUFFERS_VENDORED)
add_dependencies(arrow_ipc flatbuffers_ep)
endif()
Expand Down
24 changes: 24 additions & 0 deletions cpp/src/arrow/ipc/adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "arrow/types/primitive.h"
#include "arrow/types/string.h"
#include "arrow/types/struct.h"
#include "arrow/types/union.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/buffer.h"
#include "arrow/util/logging.h"
Expand Down Expand Up @@ -115,6 +116,13 @@ Status VisitArray(const Array* arr, std::vector<flatbuf::FieldNode>* field_nodes
RETURN_NOT_OK(
VisitArray(field.get(), field_nodes, buffers, max_recursion_depth - 1));
}
} else if (arr->type_enum() == Type::UNION) {
const auto union_arr = static_cast<const UnionArray*>(arr);
buffers->push_back(union_arr->types());
buffers->push_back(union_arr->offset_buf());
for (auto& child_arr : union_arr->children()) {
RETURN_NOT_OK(VisitArray(child_arr.get(), field_nodes, buffers, max_recursion_depth - 1));
}
} else {
return Status::NotImplemented("Unrecognized type");
}
Expand Down Expand Up @@ -363,6 +371,22 @@ class RecordBatchReader::RecordBatchReaderImpl {
out->reset(new StructArray(
type, field_meta.length, fields, field_meta.null_count, null_bitmap));
return Status::OK();
} else if (type->type == Type::UNION) {
std::shared_ptr<Buffer> types;
RETURN_NOT_OK(GetBuffer(buffer_index_++, &types));
std::shared_ptr<Buffer> offset_buf;
RETURN_NOT_OK(GetBuffer(buffer_index_++, &offset_buf));
auto union_type = std::dynamic_pointer_cast<UnionType>(type);
const int num_children = union_type->num_children();
std::vector<ArrayPtr> results;
for (int child_idx = 0; child_idx < num_children; ++child_idx) {
std::shared_ptr<Array> result;
RETURN_NOT_OK(NextArray(union_type->child(child_idx).get(), max_recursion_depth - 1, &result));
results.push_back(result);
}
out->reset(new UnionArray(
type, field_meta.length, results, types, offset_buf, field_meta.null_count, null_bitmap));
return Status::OK();
}

return Status::NotImplemented("Non-primitive types not complete yet");
Expand Down
17 changes: 17 additions & 0 deletions cpp/src/arrow/ipc/ipc-metadata-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "arrow/schema.h"
#include "arrow/test-util.h"
#include "arrow/type.h"
#include "arrow/types/union.h"
#include "arrow/util/status.h"

namespace arrow {
Expand Down Expand Up @@ -97,6 +98,22 @@ TEST_F(TestSchemaMessage, NestedFields) {
CheckRoundtrip(&schema);
}

// Round-trips two schemas through CheckRoundtrip: one whose single field "f"
// is a dense union of (f0: int32, f1: int64), then the same with a sparse
// union.
TEST_F(TestSchemaMessage, UnionType) {
  auto f0 = std::make_shared<Field>("f0", TypePtr(new Int32Type()));
  auto f1 = std::make_shared<Field>("f1", TypePtr(new Int64Type()));
  std::vector<uint8_t> type_ids = {};  // TODO(pcm): Implement typeIds

  // Same order as before: dense first, then sparse.
  for (UnionMode mode : {UnionMode::DENSE, UnionMode::SPARSE}) {
    auto union_type = TypePtr(
        new UnionType(std::vector<std::shared_ptr<Field>>({f0, f1}), type_ids, mode));
    auto field = std::make_shared<Field>("f", union_type);
    Schema schema({field});
    CheckRoundtrip(&schema);
  }
}

class TestFileFooter : public ::testing::Test {
public:
void SetUp() {}
Expand Down
32 changes: 30 additions & 2 deletions cpp/src/arrow/ipc/metadata-internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "arrow/ipc/Message_generated.h"
#include "arrow/schema.h"
#include "arrow/type.h"
#include "arrow/types/union.h"
#include "arrow/util/buffer.h"
#include "arrow/util/status.h"

Expand Down Expand Up @@ -119,8 +120,20 @@ static Status TypeFromFlatbuffer(flatbuf::Type type, const void* type_data,
case flatbuf::Type_Struct_:
*out = std::make_shared<StructType>(children);
return Status::OK();
case flatbuf::Type_Union:
return Status::NotImplemented("Type is not implemented");
case flatbuf::Type_Union: {
std::vector<uint8_t> type_ids = {}; // TODO(pcm): Implement typeIds
auto union_data = static_cast<const flatbuf::Union*>(type_data);
UnionMode mode;
if (union_data->mode() == flatbuf::UnionMode_Sparse) {
mode = UnionMode::SPARSE;
} else if (union_data->mode() == flatbuf::UnionMode_Dense) {
mode = UnionMode::DENSE;
} else {
return Status::Invalid("Unrecognized UnionMode");
}
*out = std::make_shared<UnionType>(children, type_ids, mode);
}
return Status::OK();
default:
return Status::Invalid("Unrecognized type");
}
Expand Down Expand Up @@ -158,6 +171,18 @@ static Status StructToFlatbuffer(FBB& fbb, const std::shared_ptr<DataType>& type
return Status::OK();
}

// Serializes a union type: appends one flatbuffer Field per child to
// `out_children` and stores the offset of the flatbuffer Union table in
// `offset`.
//
// The union mode is written explicitly so that it matches what
// TypeFromFlatbuffer reads back via mode(); without it every union would be
// serialized with the flatbuffer default mode.
//
// Returns Status::Invalid if `type` is not a UnionType, or the first error
// produced while serializing a child field.
// TODO(pcm): Implement typeIds
static Status UnionToFlatbuffer(FBB& fbb, const std::shared_ptr<DataType>& type,
    std::vector<FieldOffset>* out_children, Offset* offset) {
  const auto union_type = std::dynamic_pointer_cast<UnionType>(type);
  if (!union_type) { return Status::Invalid("Expected a union type"); }

  for (int i = 0; i < union_type->num_children(); ++i) {
    FieldOffset field;
    RETURN_NOT_OK(FieldToFlatbuffer(fbb, union_type->child(i), &field));
    out_children->push_back(field);
  }

  const flatbuf::UnionMode mode = (union_type->mode == UnionMode::DENSE)
                                      ? flatbuf::UnionMode_Dense
                                      : flatbuf::UnionMode_Sparse;
  *offset = flatbuf::CreateUnion(fbb, mode).Union();
  return Status::OK();
}

#define INT_TO_FB_CASE(BIT_WIDTH, IS_SIGNED) \
*out_type = flatbuf::Type_Int; \
*offset = IntToFlatbuffer(fbb, BIT_WIDTH, IS_SIGNED); \
Expand Down Expand Up @@ -208,6 +233,9 @@ static Status TypeToFlatbuffer(FBB& fbb, const std::shared_ptr<DataType>& type,
case Type::STRUCT:
*out_type = flatbuf::Type_Struct_;
return StructToFlatbuffer(fbb, type, children, offset);
case Type::UNION:
*out_type = flatbuf::Type_Union;
return UnionToFlatbuffer(fbb, type, children, offset);
default:
*out_type = flatbuf::Type_NONE; // Make clang-tidy happy
std::stringstream ss;
Expand Down
10 changes: 10 additions & 0 deletions cpp/src/arrow/type.cc
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,16 @@ std::string UnionType::ToString() const {
return s.str();
}

// Returns true when `other` passes the base DataType::Equals check, is a
// UnionType, and has the same type_id and the same list of type_ids.
//
// type_ids are compared with vector equality, which also compares sizes; a
// three-iterator std::equal would read past the end of the shorter vector.
//
// NOTE(review): `mode` is not part of this comparison, so a dense and a sparse
// union with the same children compare equal -- confirm whether that is
// intended.
bool UnionType::Equals(const DataType* other) const {
  if (!DataType::Equals(other)) { return false; }

  const UnionType* union_type = dynamic_cast<const UnionType*>(other);
  if (union_type == nullptr) { return false; }

  return type_id == union_type->type_id && type_ids == union_type->type_ids;
}

int NullType::bit_width() const {
return 0;
}
Expand Down
5 changes: 5 additions & 0 deletions cpp/src/arrow/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,11 @@ struct ARROW_EXPORT UnionType : public DataType {
static std::string name() { return "union"; }
Status Accept(TypeVisitor* visitor) const override;

// Type equality check against a raw DataType pointer (defined out of line).
bool Equals(const DataType* other) const override;
// Convenience overload: forwards the pointer held by `other` to the
// raw-pointer overload above.
bool Equals(const std::shared_ptr<DataType>& other) const {
return Equals(other.get());
}

UnionMode mode;
std::vector<uint8_t> type_ids;
};
Expand Down
27 changes: 22 additions & 5 deletions cpp/src/arrow/types/primitive.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "arrow/util/bit-util.h"
#include "arrow/util/buffer.h"
#include "arrow/util/logging.h"
#include "arrow/util/memory-util.h"

namespace arrow {

Expand Down Expand Up @@ -98,7 +99,7 @@ Status PrimitiveBuilder<T>::Init(int32_t capacity) {
int64_t nbytes = TypeTraits<T>::bytes_required(capacity);
RETURN_NOT_OK(data_->Resize(nbytes));
// TODO(emkornfield) valgrind complains without this
memset(data_->mutable_data(), 0, nbytes);
// memset(data_->mutable_data(), 0, nbytes);

raw_data_ = reinterpret_cast<value_type*>(data_->mutable_data());
return Status::OK();
Expand All @@ -117,7 +118,12 @@ Status PrimitiveBuilder<T>::Resize(int32_t capacity) {
const int64_t new_bytes = TypeTraits<T>::bytes_required(capacity);
RETURN_NOT_OK(data_->Resize(new_bytes));
raw_data_ = reinterpret_cast<value_type*>(data_->mutable_data());
memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes);
if ((new_bytes - old_bytes) >= 32*KB) {
memset_page_aligned(
data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes);
} else {
memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes);
}
}
return Status::OK();
}
Expand All @@ -128,7 +134,13 @@ Status PrimitiveBuilder<T>::Append(
RETURN_NOT_OK(Reserve(length));

if (length > 0) {
memcpy(raw_data_ + length_, values, TypeTraits<T>::bytes_required(length));
size_t numbytes = TypeTraits<T>::bytes_required(length);
if (numbytes >= MB) {
memcopy_block_aligned((uint8_t *)(raw_data_ + length_),
(uint8_t *)values, numbytes);
} else {
memcpy(raw_data_ + length_, values, numbytes);
}
}

// length_ is updated by these
Expand Down Expand Up @@ -172,7 +184,7 @@ Status BooleanBuilder::Init(int32_t capacity) {
int64_t nbytes = BitUtil::BytesForBits(capacity);
RETURN_NOT_OK(data_->Resize(nbytes));
// TODO(emkornfield) valgrind complains without this
memset(data_->mutable_data(), 0, nbytes);
//memset(data_->mutable_data(), 0, nbytes);

raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
return Status::OK();
Expand All @@ -191,7 +203,12 @@ Status BooleanBuilder::Resize(int32_t capacity) {

RETURN_NOT_OK(data_->Resize(new_bytes));
raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes);
if ((new_bytes - old_bytes) >= 32*KB) {
memset_page_aligned(
data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes);
} else {
memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes);
}
}
return Status::OK();
}
Expand Down
47 changes: 46 additions & 1 deletion cpp/src/arrow/types/union.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,50 @@
#include <vector>

#include "arrow/type.h"
#include "arrow/util/status.h"

namespace arrow {} // namespace arrow
namespace arrow {

bool UnionArray::Equals(const std::shared_ptr<Array>& arr) const {
if (this == arr.get()) { return true; }
if (!arr) { return false; }
if (this->type_enum() != arr->type_enum()) { return false; }
if (null_count_ != arr->null_count()) { return false; }
return RangeEquals(0, length_, 0, arr);
}

bool UnionArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
const std::shared_ptr<Array>& arr) const {
if (this == arr.get()) { return true; }
if (Type::UNION != arr->type_enum()) { return false; }
const auto other = static_cast<UnionArray*>(arr.get());

// TODO(pcm): Handle sparse case here

int32_t i = start_idx;
int32_t o_i = other_start_idx;
for (size_t c = 0; c < other->children().size(); ++c) {
for (int32_t e = 0; e < other->children()[c]->length(); ++e) {
if (!children()[c]->RangeEquals(e, e + 1, e, other->children()[c])) { // FIXME(pcm): fix this
return false;
}
i += 1;
o_i += 1;
if (i >= end_idx) {
return true;
}
}
}
return false; // to make the compiler happy
}

// Placeholder validation: performs no checks and always returns Status::OK().
Status UnionArray::Validate() const {
// TODO(pcm): what to do here?
return Status::OK();
}

// Visitor entry point: passes this array to visitor->Visit and returns the
// resulting Status. `visitor` is dereferenced without a null check.
Status UnionArray::Accept(ArrayVisitor* visitor) const {
return visitor->Visit(*this);
}

} // namespace arrow
Loading